diff options
Diffstat (limited to 'libraries/atlas/atlas.patch')
-rw-r--r-- | libraries/atlas/atlas.patch | 5072 |
1 files changed, 5072 insertions, 0 deletions
diff --git a/libraries/atlas/atlas.patch b/libraries/atlas/atlas.patch new file mode 100644 index 0000000000000..dea4dcc0b2eeb --- /dev/null +++ b/libraries/atlas/atlas.patch @@ -0,0 +1,5072 @@ +diff -rupN ATLAS/CONFIG/src/backend/archinfo_x86.c atlas-3.8.3/CONFIG/src/backend/archinfo_x86.c +--- ATLAS/CONFIG/src/backend/archinfo_x86.c 2009-02-18 19:47:37.000000000 +0100 ++++ atlas-3.8.3/CONFIG/src/backend/archinfo_x86.c 2009-11-12 13:47:23.777451677 +0100 +@@ -320,7 +320,7 @@ enum MACHTYPE Chip2Mach(enum CHIP chip, + iret = IntP4; + break; + case 3: +- case 4: ++ case 4: ; case 6: + iret = IntP4E; + break; + default: +diff -rupN ATLAS/include/atlas_lvl3.h atlas-3.8.3/include/atlas_lvl3.h +--- ATLAS/include/atlas_lvl3.h 2009-02-18 19:47:35.000000000 +0100 ++++ atlas-3.8.3/include/atlas_lvl3.h 2009-11-12 13:52:49.308496090 +0100 +@@ -126,7 +126,7 @@ + #define CPAT Mjoin(C_ATL_, PRE); + + #ifndef ATL_MaxMalloc +- #define ATL_MaxMalloc 67108864 ++ #define ATL_MaxMalloc XXX_MaxMalloc_XXX + #endif + + typedef void (*MAT2BLK)(int, int, const TYPE*, int, TYPE*, const SCALAR); +diff -rupN ATLAS/src/blas/gemm/ATL_cmmJITcp.c atlas-3.8.3/src/blas/gemm/ATL_cmmJITcp.c +--- ATLAS/src/blas/gemm/ATL_cmmJITcp.c 2009-02-18 19:47:44.000000000 +0100 ++++ atlas-3.8.3/src/blas/gemm/ATL_cmmJITcp.c 2009-11-12 12:44:34.816529051 +0100 +@@ -268,7 +268,8 @@ static void Mjoin(PATL,mmK) + { + NBmm0 = NBmm1 = NBmmX = Mjoin(PATLU,pKBmm); + if (SCALAR_IS_ZERO(beta)) +- Mjoin(PATL,gezero)(M, N, C, ldc); ++ /* Mjoin(PATL,gezero)(M, N, C, ldc); */ ++ { Mjoin(PATLU,gezero)(M, N, pC, ldpc); Mjoin(PATLU,gezero)(M, N, pC+ipc, ldpc); } + } + if (nblk) + { +diff -rupN ATLAS/src/blas/gemm/ATL_gereal2cplx.c atlas-3.8.3/src/blas/gemm/ATL_gereal2cplx.c +--- ATLAS/src/blas/gemm/ATL_gereal2cplx.c 2009-02-18 19:47:44.000000000 +0100 ++++ atlas-3.8.3/src/blas/gemm/ATL_gereal2cplx.c 2009-11-12 12:49:49.331651677 +0100 +@@ -43,7 +43,53 @@ void Mjoin(PATL,gereal2cplx) + const int ldc2 = (ldc-M)<<1; + int i, j; + +- if (ialp == ATL_rzero && ibet == ATL_rzero) ++/* ++ * Cannot read C if BETA is 0 ++ */ ++ if (rbet == ATL_rzero && ibet == ATL_rzero) ++ { ++ if (ialp == ATL_rzero) /* alpha is a real number */ ++ { ++ if (ralp == ATL_rone) /* alpha = 1.0 */ ++ { ++ for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2) ++ { ++ for (i=0; i < M; i++, C += 2) ++ { ++ *C = R[i]; ++ C[1] = I[i]; ++ } ++ } ++ } ++ else ++ { ++ for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2) ++ { ++ for (i=0; i < M; i++, C += 2) ++ { ++ *C = ralp * R[i]; ++ C[1] = ralp * I[i]; ++ } ++ } ++ } ++ } ++ else /* alpha is a complex number */ ++ { ++ for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2) ++ { ++ for (i=0; i < M; i++, C += 2) ++ { ++ ra = R[i]; ia = I[i]; ++ C[0] = ralp * ra - ialp * ia; ++ C[1] = ralp * ia + ialp * ra; ++ } ++ } ++ } ++ } ++/* ++ * If alpha and beta are both real numbers ++ */ ++ else if (ialp == ATL_rzero && ibet == ATL_rzero) + { + if (ralp == ATL_rone && rbet == ATL_rone) + { +diff -rupN ATLAS/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c atlas-3.8.3/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c +--- ATLAS/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c 2009-02-18 19:48:26.000000000 +0100 ++++ atlas-3.8.3/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c 2009-11-12 12:35:50.453038827 +0100 +@@ -27,6 +27,13 @@ + * POSSIBILITY OF SUCH DAMAGE. + * + */ ++#if KB > 84 ++ #error "KB cannot exceed 84!" ++#endif ++#if (KB/4)*4 != KB ++ #error "KB must be a multiple of 4!" ++#endif ++ + #ifndef ATL_GAS_x8664 + #error "This kernel requires x86-64 assembly!" + #endif +@@ -58,25 +65,25 @@ + * Integer register usage shown be these defines + */ + #define pA %rcx +-#define pA10 %rbx +-#define ldab %rbp +-#define mldab %rdx ++#define pA10 %rbx ++#define ldab %rbp ++#define mldab %rdx + #define mldab5 %rax + #define pB %rdi + #define pC %rsi + #define incCn %r10 + #define stM %r9 + #define stN %r11 +-#define pfA %r8 +-#define pA5 pA +-#define pB0 pB ++#define pfA %r8 ++#define pA5 pA ++#define pB0 pB + #if MB == 0 +- #define stM0 %r12 +- #define incAm %r13 ++ #define stM0 %r12 ++ #define incAm %r13 + #endif + /* rax used in 32/64 conversion */ + +-#define NBso (KB*4) ++#define NBso (KB*4) + #define MBKBso (MB*KB*4) + #define NB2so (NBso+NBso) + #define NB3so (NBso+NBso+NBso) +@@ -95,22 +102,22 @@ + /* + * SSE2 register usage shown be these defines + */ +-#define rA0 %xmm0 +-#define rB0 %xmm1 +-#define rC0 %xmm2 +-#define rC1 %xmm3 +-#define rC2 %xmm4 +-#define rC3 %xmm5 +-#define rC4 %xmm6 +-#define rC5 %xmm7 +-#define rC6 %xmm8 +-#define rC7 %xmm9 +-#define rC8 %xmm10 +-#define rC9 %xmm11 +-#define rC10 %xmm12 +-#define rC11 %xmm13 +-#define rC12 %xmm14 +-#define rC13 %xmm15 ++#define rA0 %xmm0 ++#define rB0 %xmm1 ++#define rC0 %xmm2 ++#define rC1 %xmm3 ++#define rC2 %xmm4 ++#define rC3 %xmm5 ++#define rC4 %xmm6 ++#define rC5 %xmm7 ++#define rC6 %xmm8 ++#define rC7 %xmm9 ++#define rC8 %xmm10 ++#define rC9 %xmm11 ++#define rC10 %xmm12 ++#define rC11 %xmm13 ++#define rC12 %xmm14 ++#define rC13 %xmm15 + /* + * Prefetch defines + */ +@@ -127,99 +134,99 @@ + #if MB != 0 + #define incAm $MBKBso-NB14so+176 + #endif +- .text ++ .text + .global ATL_asmdecor(ATL_USERMM) + ATL_asmdecor(ATL_USERMM): + /* + * Save callee-saved iregs + */ +- movq %rbp, -8(%rsp) +- movq %rbx, -16(%rsp) ++ movq %rbp, -8(%rsp) ++ movq %rbx, -16(%rsp) + #if MB == 0 +- movq %r12, -32(%rsp) +- movq %r13, -40(%rsp) ++ movq %r12, -32(%rsp) ++ movq %r13, -40(%rsp) + #endif + #ifdef BETAX + #define BOF -56 +- movss %xmm1, BOF(%rsp) +- movss %xmm1, BOF+4(%rsp) +- movss %xmm1, BOF+8(%rsp) +- movss %xmm1, BOF+12(%rsp) ++ movss %xmm1, BOF(%rsp) ++ movss %xmm1, BOF+4(%rsp) ++ movss %xmm1, BOF+8(%rsp) ++ movss %xmm1, BOF+12(%rsp) + #endif + /* + * pA already comes in right reg + * Initialize pB = B; pC = C; NBso = NB * sizeof; + */ +- movq %rsi, stN +- movq %rdi, %rax +- movq 16(%rsp), pC +- prefC((pC)) +- prefC(64(pC)) +- movq %r9, pB +- prefB((pB)) +- prefB(64(pB)) +- movq %rax, stM ++ movq %rsi, stN ++ movq %rdi, %rax ++ movq 16(%rsp), pC ++ prefC((pC)) ++ prefC(64(pC)) ++ movq %r9, pB ++ prefB((pB)) ++ prefB(64(pB)) ++ movq %rax, stM + /* + * stM = pA + NBNBso; stN = pB + NBNBso; + */ + #if MB == 0 +- movq stM, pfA +- imulq $NBso, pfA +- prefB(128(pB)) +- movq pfA, incAm +- addq pA5, pfA +- addq $176-NB14so, incAm ++ movq stM, pfA ++ imulq $NBso, pfA ++ prefB(128(pB)) ++ movq pfA, incAm ++ addq pA5, pfA ++ addq $176-NB14so, incAm + #else +- movq $MBKBso, pfA +- addq pA5, pfA +- prefB(128(pB)) ++ movq $MBKBso, pfA ++ addq pA5, pfA ++ prefB(128(pB)) + #endif + /* + * convert ldc to 64 bits, and then set incCn = (ldc - MB)*sizeof + */ +- movl 24(%rsp), %eax +- cltq +- movq %rax, incCn +- subq stM, incCn +- addq $14, incCn ++ movl 24(%rsp), %eax ++ cltq ++ movq %rax, incCn ++ subq stM, incCn ++ addq $14, incCn + #ifdef SREAL +- shl $2, incCn ++ shl $2, incCn + #else +- shl $3, incCn +- prefC(128(pC)) +- prefC(192(pC)) ++ shl $3, incCn ++ prefC(128(pC)) ++ prefC(192(pC)) + #endif + /* + * Find M/14 if MB is not set + */ + #if MB == 0 +- cmp $84, stM +- jne MB_LT84 +-/* movq $84/14, stM */ +- movq $6, stM ++ cmp $84, stM ++ jne MB_LT84 ++/* movq $84/14, stM */ ++ movq $6, stM + MBFOUND: +- subq $1, stM +- movq stM, stM0 ++ subq $1, stM ++ movq stM, stM0 + #endif +- addq $120, pA5 +- addq $120, pB0 +- movq $KB*4, ldab +- movq $-KB*5*4, mldab5 +- movq $-KB*4, mldab +- subq mldab5, pA5 +- lea KB*4(pA5, ldab,4), pA10 +-/* movq $NB, stN */ ++ addq $120, pA5 ++ addq $120, pB0 ++ movq $KB*4, ldab ++ movq $-KB*5*4, mldab5 ++ movq $-KB*4, mldab ++ subq mldab5, pA5 ++ lea KB*4(pA5, ldab,4), pA10 ++/* movq $NB, stN */ + + UNLOOP: + #if MB == 0 +- movq stM0, stM +- cmp $0, stM +- je MLAST ++ movq stM0, stM ++ cmp $0, stM ++ je MLAST + #else + #ifdef ATL_DivAns +- movq $ATL_DivAns-1, stM ++ movq $ATL_DivAns-1, stM + #else +- movq $MB/14-1, stM ++ movq $MB/14-1, stM + #endif + #endif + #if MB == 0 || MB > 14 +@@ -227,992 +234,992 @@ UMLOOP: + /* + * rC[0-13] = pC[0-13] * beta + */ +- ALIGN16 ++ ALIGN16 + /*UKLOOP: */ + #ifdef BETA1 +- movaps 0-120(pA10,mldab5,2), rC0 +- movaps 0-120(pB0), rB0 +- mulps rB0, rC0 +- addss (pC), rC0 +- movaps 0-120(pA5, mldab,4), rC1 +- mulps rB0, rC1 +- addss CMUL(4)(pC), rC1 +- movaps 0-120(pA10, mldab,8), rC2 +- mulps rB0, rC2 +- addss CMUL(8)(pC), rC2 +- movaps 0-120(pA5, mldab,2), rC3 +- mulps rB0, rC3 +- addss CMUL(12)(pC), rC3 +- movaps 0-120(pA5, mldab), rC4 +- mulps rB0, rC4 +- addss CMUL(16)(pC), rC4 +- movaps 0-120(pA5), rC5 +- mulps rB0, rC5 +- addss CMUL(20)(pC), rC5 +- movaps 0-120(pA5, ldab), rC6 +- mulps rB0, rC6 +- addss CMUL(24)(pC), rC6 +- movaps 0-120(pA5, ldab,2), rC7 +- mulps rB0, rC7 +- addss CMUL(28)(pC), rC7 +- movaps 0-120(pA10, mldab,2), rC8 +- mulps rB0, rC8 +- addss CMUL(32)(pC), rC8 +- movaps 0-120(pA5,ldab,4), rC9 +- mulps rB0, rC9 +- addss CMUL(36)(pC), rC9 +- movaps 0-120(pA10), rC10 +- mulps rB0, rC10 +- addss CMUL(40)(pC), rC10 +- movaps 0-120(pA10,ldab), rC11 +- mulps rB0, rC11 +- addss CMUL(44)(pC), rC11 +- movaps 0-120(pA10,ldab,2), rC12 +- mulps rB0, rC12 +- addss CMUL(48)(pC), rC12 +- movaps 0-120(pA5,ldab,8), rC13 +- mulps rB0, rC13 +- addss CMUL(52)(pC), rC13 ++ movaps 0-120(pA10,mldab5,2), rC0 ++ movaps 0-120(pB0), rB0 ++ mulps rB0, rC0 ++ addss (pC), rC0 ++ movaps 0-120(pA5, mldab,4), rC1 ++ mulps rB0, rC1 ++ addss CMUL(4)(pC), rC1 ++ movaps 0-120(pA10, mldab,8), rC2 ++ mulps rB0, rC2 ++ addss CMUL(8)(pC), rC2 ++ movaps 0-120(pA5, mldab,2), rC3 ++ mulps rB0, rC3 ++ addss CMUL(12)(pC), rC3 ++ movaps 0-120(pA5, mldab), rC4 ++ mulps rB0, rC4 ++ addss CMUL(16)(pC), rC4 ++ movaps 0-120(pA5), rC5 ++ mulps rB0, rC5 ++ addss CMUL(20)(pC), rC5 ++ movaps 0-120(pA5, ldab), rC6 ++ mulps rB0, rC6 ++ addss CMUL(24)(pC), rC6 ++ movaps 0-120(pA5, ldab,2), rC7 ++ mulps rB0, rC7 ++ addss CMUL(28)(pC), rC7 ++ movaps 0-120(pA10, mldab,2), rC8 ++ mulps rB0, rC8 ++ addss CMUL(32)(pC), rC8 ++ movaps 0-120(pA5,ldab,4), rC9 ++ mulps rB0, rC9 ++ addss CMUL(36)(pC), rC9 ++ movaps 0-120(pA10), rC10 ++ mulps rB0, rC10 ++ addss CMUL(40)(pC), rC10 ++ movaps 0-120(pA10,ldab), rC11 ++ mulps rB0, rC11 ++ addss CMUL(44)(pC), rC11 ++ movaps 0-120(pA10,ldab,2), rC12 ++ mulps rB0, rC12 ++ addss CMUL(48)(pC), rC12 ++ movaps 0-120(pA5,ldab,8), rC13 ++ mulps rB0, rC13 ++ addss CMUL(52)(pC), rC13 + #else +- movaps 0-120(pA10,mldab5,2), rC0 +- movaps 0-120(pB0), rC13 +- mulps rC13, rC0 +- movaps 0-120(pA5, mldab,4), rC1 +- mulps rC13, rC1 +- movaps 0-120(pA10, mldab,8), rC2 +- mulps rC13, rC2 +- movaps 0-120(pA5, mldab,2), rC3 +- mulps rC13, rC3 +- movaps 0-120(pA5, mldab), rC4 +- mulps rC13, rC4 +- movaps 0-120(pA5), rC5 +- mulps rC13, rC5 +- movaps 0-120(pA5, ldab), rC6 +- mulps rC13, rC6 +- movaps 0-120(pA5, ldab,2), rC7 +- mulps rC13, rC7 +- movaps 0-120(pA10, mldab,2), rC8 +- mulps rC13, rC8 +- movaps 0-120(pA5,ldab,4), rC9 +- mulps rC13, rC9 +- movaps 0-120(pA10), rC10 +- mulps rC13, rC10 +- movaps 0-120(pA10,ldab), rC11 +- mulps rC13, rC11 +- movaps 0-120(pA10,ldab,2), rC12 +- mulps rC13, rC12 +- mulps 0-120(pA5,ldab,8), rC13 ++ movaps 0-120(pA10,mldab5,2), rC0 ++ movaps 0-120(pB0), rC13 ++ mulps rC13, rC0 ++ movaps 0-120(pA5, mldab,4), rC1 ++ mulps rC13, rC1 ++ movaps 0-120(pA10, mldab,8), rC2 ++ mulps rC13, rC2 ++ movaps 0-120(pA5, mldab,2), rC3 ++ mulps rC13, rC3 ++ movaps 0-120(pA5, mldab), rC4 ++ mulps rC13, rC4 ++ movaps 0-120(pA5), rC5 ++ mulps rC13, rC5 ++ movaps 0-120(pA5, ldab), rC6 ++ mulps rC13, rC6 ++ movaps 0-120(pA5, ldab,2), rC7 ++ mulps rC13, rC7 ++ movaps 0-120(pA10, mldab,2), rC8 ++ mulps rC13, rC8 ++ movaps 0-120(pA5,ldab,4), rC9 ++ mulps rC13, rC9 ++ movaps 0-120(pA10), rC10 ++ mulps rC13, rC10 ++ movaps 0-120(pA10,ldab), rC11 ++ mulps rC13, rC11 ++ movaps 0-120(pA10,ldab,2), rC12 ++ mulps rC13, rC12 ++ mulps 0-120(pA5,ldab,8), rC13 + #endif + + #if KB > 4 +- movaps 16-120(pA10,mldab5,2), rA0 +- movaps 16-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 16-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 16-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 16-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 16-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 16-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 16-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 16-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 16-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 16-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 16-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 16-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 16-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 16-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 16-120(pA10,mldab5,2), rA0 ++ movaps 16-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 16-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 16-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 16-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 16-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 16-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 16-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 16-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 16-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 16-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 16-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 16-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 16-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 16-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 8 +- movaps 32-120(pA10,mldab5,2), rA0 +- movaps 32-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 32-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 32-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 32-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 32-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 32-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 32-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 32-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 32-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 32-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 32-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 32-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 32-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 32-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 32-120(pA10,mldab5,2), rA0 ++ movaps 32-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 32-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 32-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 32-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 32-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 32-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 32-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 32-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 32-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 32-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 32-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 32-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 32-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 32-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 12 +- movaps 48-120(pA10,mldab5,2), rA0 +- movaps 48-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 48-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 48-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 48-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 48-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 48-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 48-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 48-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 48-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 48-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 48-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 48-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 48-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 48-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 48-120(pA10,mldab5,2), rA0 ++ movaps 48-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 48-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 48-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 48-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 48-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 48-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 48-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 48-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 48-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 48-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 48-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 48-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 48-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 48-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 16 +- movaps 64-120(pA10,mldab5,2), rA0 +- movaps 64-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 64-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 64-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 64-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 64-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 64-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 64-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 64-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 64-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 64-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 64-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 64-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 64-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 64-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 64-120(pA10,mldab5,2), rA0 ++ movaps 64-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 64-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 64-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 64-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 64-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 64-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 64-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 64-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 64-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 64-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 64-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 64-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 64-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 64-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 20 +- movaps 80-120(pA10,mldab5,2), rA0 +- movaps 80-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 80-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 80-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 80-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 80-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 80-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 80-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 80-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 80-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 80-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 80-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 80-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 80-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 80-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 80-120(pA10,mldab5,2), rA0 ++ movaps 80-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 80-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 80-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 80-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 80-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 80-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 80-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 80-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 80-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 80-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 80-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 80-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 80-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 80-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 24 +- movaps 96-120(pA10,mldab5,2), rA0 +- movaps 96-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 96-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 96-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 96-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 96-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 96-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 96-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 96-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 96-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 96-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 96-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 96-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 96-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 96-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 96-120(pA10,mldab5,2), rA0 ++ movaps 96-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 96-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 96-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 96-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 96-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 96-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 96-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 96-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 96-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 96-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 96-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 96-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 96-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 96-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 28 +- movaps 112-120(pA10,mldab5,2), rA0 +- movaps 112-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 112-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 112-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 112-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 112-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 112-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 112-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 112-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 112-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 112-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 112-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 112-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 112-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 112-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 112-120(pA10,mldab5,2), rA0 ++ movaps 112-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 112-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 112-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 112-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 112-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 112-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 112-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 112-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 112-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 112-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 112-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 112-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 112-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 112-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + #ifndef SREAL +- pref2((pfA)) +- pref2(64(pfA)) ++ pref2((pfA)) ++ pref2(64(pfA)) + #endif + + #if KB > 32 +- movaps 128-120(pA10,mldab5,2), rA0 +- movaps 128-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 128-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 128-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 128-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 128-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 128-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 128-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 128-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 128-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 128-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 128-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 128-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 128-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 128-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 128-120(pA10,mldab5,2), rA0 ++ movaps 128-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 128-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 128-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 128-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 128-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 128-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 128-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 128-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 128-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 128-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 128-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 128-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 128-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 128-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 36 +- movaps 144-120(pA10,mldab5,2), rA0 +- movaps 144-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 144-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 144-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 144-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 144-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 144-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 144-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 144-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 144-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 144-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 144-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 144-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 144-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 144-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 144-120(pA10,mldab5,2), rA0 ++ movaps 144-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 144-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 144-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 144-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 144-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 144-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 144-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 144-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 144-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 144-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 144-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 144-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 144-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 144-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 40 +- movaps 160-120(pA10,mldab5,2), rA0 +- movaps 160-120(pB0), rB0 +- mulps rB0, rA0 +- addq $176, pB0 +- addps rA0, rC0 +- movaps 160-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 160-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 160-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 160-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 160-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 160-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 160-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 160-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 160-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 160-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 160-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 160-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addq $176, pA10 +- addps rA0, rC12 +- mulps 160-120(pA5,ldab,8), rB0 +- addps rB0, rC13 +- addq $176, pA5 ++ movaps 160-120(pA10,mldab5,2), rA0 ++ movaps 160-120(pB0), rB0 ++ mulps rB0, rA0 ++ addq $176, pB0 ++ addps rA0, rC0 ++ movaps 160-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 160-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 160-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 160-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 160-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 160-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 160-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 160-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 160-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 160-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 160-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 160-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addq $176, pA10 ++ addps rA0, rC12 ++ mulps 160-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 ++ addq $176, pA5 + #else +- addq $176, pB0 +- addq $176, pA10 +- addq $176, pA5 ++ addq $176, pB0 ++ addq $176, pA10 ++ addq $176, pA5 + #endif + + #if KB > 44 +- movaps 0-120(pA10,mldab5,2), rA0 +- movaps 0-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 0-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 0-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 0-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 0-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 0-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 0-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 0-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 0-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 0-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 0-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 0-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 0-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 0-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 0-120(pA10,mldab5,2), rA0 ++ movaps 0-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 0-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 0-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 0-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 0-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 0-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 0-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 0-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 0-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 0-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 0-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 0-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 0-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 0-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 48 +- movaps 16-120(pA10,mldab5,2), rA0 +- movaps 16-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 16-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 16-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 16-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 16-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 16-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 16-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 16-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 16-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 16-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 16-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 16-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 16-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 16-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 16-120(pA10,mldab5,2), rA0 ++ movaps 16-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 16-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 16-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 16-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 16-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 16-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 16-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 16-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 16-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 16-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 16-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 16-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 16-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 16-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 52 +- movaps 32-120(pA10,mldab5,2), rA0 +- movaps 32-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 32-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 32-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 32-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 32-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 32-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 32-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 32-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 32-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 32-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 32-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 32-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 32-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 32-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 32-120(pA10,mldab5,2), rA0 ++ movaps 32-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 32-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 32-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 32-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 32-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 32-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 32-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 32-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 32-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 32-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 32-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 32-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 32-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 32-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 56 +- movaps 48-120(pA10,mldab5,2), rA0 +- movaps 48-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 48-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 48-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 48-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 48-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 48-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 48-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 48-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 48-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 48-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 48-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 48-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 48-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 48-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 48-120(pA10,mldab5,2), rA0 ++ movaps 48-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 48-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 48-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 48-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 48-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 48-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 48-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 48-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 48-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 48-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 48-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 48-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 48-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 48-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 60 +- movaps 64-120(pA10,mldab5,2), rA0 +- movaps 64-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 64-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 64-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 64-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 64-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 64-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 64-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 64-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 64-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 64-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 64-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 64-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 64-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 64-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 64-120(pA10,mldab5,2), rA0 ++ movaps 64-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 64-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 64-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 64-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 64-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 64-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 64-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 64-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 64-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 64-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 64-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 64-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 64-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 64-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 64 +- movaps 80-120(pA10,mldab5,2), rA0 +- movaps 80-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 80-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 80-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 80-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 80-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 80-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 80-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 80-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 80-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 80-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 80-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 80-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 80-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 80-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 80-120(pA10,mldab5,2), rA0 ++ movaps 80-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 80-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 80-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 80-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 80-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 80-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 80-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 80-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 80-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 80-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 80-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 80-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 80-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 80-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 68 +- movaps 96-120(pA10,mldab5,2), rA0 +- movaps 96-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 96-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 96-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 96-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 96-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 96-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 96-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 96-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 96-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 96-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 96-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 96-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 96-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 96-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 96-120(pA10,mldab5,2), rA0 ++ movaps 96-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 96-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 96-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 96-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 96-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 96-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 96-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 96-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 96-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 96-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 96-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 96-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 96-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 96-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 72 +- movaps 112-120(pA10,mldab5,2), rA0 +- movaps 112-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 112-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 112-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 112-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 112-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 112-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 112-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 112-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 112-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 112-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 112-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 112-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 112-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 112-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 112-120(pA10,mldab5,2), rA0 ++ movaps 112-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 112-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 112-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 112-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 112-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 112-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 112-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 112-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 112-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 112-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 112-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 112-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 112-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 112-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 76 +- movaps 128-120(pA10,mldab5,2), rA0 +- movaps 128-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 128-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 128-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 128-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 128-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 128-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 128-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 128-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 128-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 128-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 128-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 128-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 128-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 128-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 128-120(pA10,mldab5,2), rA0 ++ movaps 128-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 128-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 128-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 128-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 128-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 128-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 128-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 128-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 128-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 128-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 128-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 128-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 128-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 128-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 80 +- movaps 144-120(pA10,mldab5,2), rA0 +- movaps 144-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 144-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 144-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 144-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 144-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 144-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 144-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 144-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 144-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 144-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 144-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 144-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 144-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 144-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 144-120(pA10,mldab5,2), rA0 ++ movaps 144-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 144-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 144-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 144-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 144-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 144-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 144-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 144-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 144-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 144-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 144-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 144-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 144-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 144-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + /*UKLOOP */ +@@ -1220,234 +1227,234 @@ UMLOOP: + * Get these bastard things summed up correctly + */ + +- /* rC0 = c0a c0b c0c c0d */ +- /* rC1 = c1a c1b c1c c1d */ +- /* rC2 = c2a c2b c2c c2d */ +- /* rC3 = c3a c3b c3c c3d */ ++ /* rC0 = c0a c0b c0c c0d */ ++ /* rC1 = c1a c1b c1c c1d */ ++ /* rC2 = c2a c2b c2c c2d */ ++ /* rC3 = c3a c3b c3c c3d */ + /* */ +- movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */ +- prefC((pC)) +- prefC(64(pC)) +- movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */ +- unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */ +- unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */ +- unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */ +- movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */ +- unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */ +- movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */ +- movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */ +- movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */ +- addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */ +- movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */ +- movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */ +- movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */ +- addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */ +- movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */ +- addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */ +- +- +- /* rC4 = c4a c4b c4c c4d */ +- /* rC5 = c5a c5b c5c c5d */ +- /* rC6 = c6a c6b c6c c6d */ +- /* rC7 = c7a c7b c7c c7d */ +- /* rC8 = c08a c08b c08c c08d */ +- /* rC9 = c09a c09b c09c c09d */ +- /* rC10 = c10a c10b c10c c10d */ +- /* rC11 = c11a c11b c11c c11d */ +- /* rC12 = c12a c12b c12c c12d */ +- /* rC13 = c13a c13b c13c c13d */ ++ movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */ ++ prefC((pC)) ++ prefC(64(pC)) ++ movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */ ++ unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */ ++ unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */ ++ unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */ ++ movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */ ++ unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */ ++ movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */ ++ movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */ ++ movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */ ++ addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */ ++ movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */ ++ movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */ ++ movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */ ++ addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */ ++ movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */ ++ addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */ ++ ++ ++ /* rC4 = c4a c4b c4c c4d */ ++ /* rC5 = c5a c5b c5c c5d */ ++ /* rC6 = c6a c6b c6c c6d */ ++ /* rC7 = c7a c7b c7c c7d */ ++ /* rC8 = c08a c08b c08c c08d */ ++ /* rC9 = c09a c09b c09c c09d */ ++ /* rC10 = c10a c10b c10c c10d */ ++ /* rC11 = c11a c11b c11c c11d */ ++ /* rC12 = c12a c12b c12c c12d */ ++ /* rC13 = c13a c13b c13c c13d */ + /* */ +- movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */ +- prefC(128(pC)) ++ movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */ ++ prefC(128(pC)) + #ifdef SREAL +- pref2((pfA)) ++ pref2((pfA)) + #else +- prefC(192(pC)) ++ prefC(192(pC)) + #endif +- movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */ +- movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */ +- unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */ +- unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */ +- unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */ +- unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */ +- unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */ +- movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */ +- unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */ +- movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */ +- movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */ +- unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */ +- movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */ +- movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */ +- addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */ ++ movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */ ++ movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */ ++ unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */ ++ unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */ ++ unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */ ++ unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */ ++ unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */ ++ movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */ ++ unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */ ++ movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */ ++ movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */ ++ unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */ ++ movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */ ++ movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */ ++ addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */ + #ifdef BETAX + #ifdef SREAL +- movups (pC), rA0 +- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ +- movups 16(pC), rC4 +- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ +- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ +- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ +- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ +- movups 32(pC), rC5 +- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ +- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ +- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ +- movlps 48(pC), rC1 +- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ +- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ +- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ +- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ +- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ +- pref2(64(pfA)) +- mulps BOF(%rsp), rA0 +- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ +- mulps BOF(%rsp), rC4 +- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ +- mulps BOF(%rsp), rC5 +- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ +- mulps BOF(%rsp), rC1 ++ movups (pC), rA0 ++ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ ++ movups 16(pC), rC4 ++ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ ++ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ ++ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ ++ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ ++ movups 32(pC), rC5 ++ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ ++ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ ++ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ ++ movlps 48(pC), rC1 ++ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ ++ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ ++ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ ++ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ ++ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ ++ pref2(64(pfA)) ++ mulps BOF(%rsp), rA0 ++ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ ++ mulps BOF(%rsp), rC4 ++ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ ++ mulps BOF(%rsp), rC5 ++ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ ++ mulps BOF(%rsp), rC1 + + /* */ + +- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ +- addps rA0, rC3 +- addq $68, pfA +- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ +- addps rC4, rC7 +- addps rC5, rC11 +- addps rC1, rC12 ++ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ ++ addps rA0, rC3 ++ addq $68, pfA ++ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ ++ addps rC4, rC7 ++ addps rC5, rC11 ++ addps rC1, rC12 + #else /* BETA = X, complex type */ +- movups (pC), rA0 +- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ +- movups 16(pC), rC4 +- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ +- shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */ +- movups 32(pC), rC4 /* rC4 = c4 X c5 X */ +- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ +- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ +- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ +- movups 48(pC), rC5 /* rC5 = c6 X c7 X */ +- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ +- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ +- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ +- shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */ +- movups 64(pC), rC5 /* rC5 = c8 X c9 X */ +- movups 80(pC), rC1 /* rC1 = c10 X c11 X */ +- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ +- shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */ +- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ +- movss 96(pC), rC1 +- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ +- movss 104(pC), rB0 +- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ +- unpcklps rB0, rC1 +- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ +- prefC(256(pC)) +- mulps BOF(%rsp), rA0 +- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ +- mulps BOF(%rsp), rC4 +- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ +- mulps BOF(%rsp), rC5 +- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ +- mulps BOF(%rsp), rC1 ++ movups (pC), rA0 ++ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ ++ movups 16(pC), rC4 ++ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ ++ shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */ ++ movups 32(pC), rC4 /* rC4 = c4 X c5 X */ ++ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ ++ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ ++ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ ++ movups 48(pC), rC5 /* rC5 = c6 X c7 X */ ++ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ ++ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ ++ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ ++ shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */ ++ movups 64(pC), rC5 /* rC5 = c8 X c9 X */ ++ movups 80(pC), rC1 /* rC1 = c10 X c11 X */ ++ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ ++ shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */ ++ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ ++ movss 96(pC), rC1 ++ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ ++ movss 104(pC), rB0 ++ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ ++ unpcklps rB0, rC1 ++ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ ++ prefC(256(pC)) ++ mulps BOF(%rsp), rA0 ++ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ ++ mulps BOF(%rsp), rC4 ++ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ ++ mulps BOF(%rsp), rC5 ++ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ ++ mulps BOF(%rsp), rC1 + + /* */ + +- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ +- addps rA0, rC3 +- prefC(192(pC)) +- addq $68, pfA +- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ +- addps rC4, rC7 +- addps rC5, rC11 +- addps rC1, rC12 ++ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ ++ addps rA0, rC3 ++ prefC(192(pC)) ++ addq $68, pfA ++ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ ++ addps rC4, rC7 ++ addps rC5, rC11 ++ addps rC1, rC12 + #endif + + #else +- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ +- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ +- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ +- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ +- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ +- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ +- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ +- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ +- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ +- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ +- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ +- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ +- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ ++ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ ++ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ ++ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ ++ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ ++ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ ++ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ ++ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ ++ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ ++ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ ++ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ ++ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ ++ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ ++ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ + #ifdef SREAL +- pref2(64(pfA)) ++ pref2(64(pfA)) + #else +- prefC(256(pC)) ++ prefC(256(pC)) + #endif +- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ +- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ +- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ ++ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ ++ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ ++ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ + + /* */ + +- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ ++ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ + #ifndef SREAL +- prefC(192(pC)) ++ prefC(192(pC)) + #endif +- addq $68, pfA +- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ ++ addq $68, pfA ++ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ + + #endif + /* + * Write results back to C; pC += 14; + */ + #ifdef SREAL +- movups rC3, (pC) +- movups rC7, 16(pC) +- movups rC11, 32(pC) +- movlps rC12, 48(pC) +- addq $56, pC ++ movups rC3, (pC) ++ movups rC7, 16(pC) ++ movups rC11, 32(pC) ++ movlps rC12, 48(pC) ++ addq $56, pC + #else +- movss rC3, (pC) +- movss rC7, 32(pC) +- movhlps rC3, rC0 +- movhlps rC7, rC6 +- movss rC0, 16(pC) +- movss rC6, 48(pC) +- shufps $0x55, rC3, rC3 +- shufps $0x55, rC7, rC7 +- movss rC3, 8(pC) +- movss rC7, 40(pC) +- shufps $0x55, rC0, rC0 +- shufps $0x55, rC6, rC6 +- movss rC0, 24(pC) +- movss rC6, 56(pC) +- +- movss rC11, 64(pC) +- movhlps rC11, rC2 +- movss rC12, 96(pC) +- movss rC2, 80(pC) +- shufps $0x55, rC11, rC11 +- shufps $0x55, rC12, rC12 +- movss rC11, 72(pC) +- shufps $0x55, rC2, rC2 +- movss rC12, 104(pC) +- movss rC2, 88(pC) ++ movss rC3, (pC) ++ movss rC7, 32(pC) ++ movhlps rC3, rC0 ++ movhlps rC7, rC6 ++ movss rC0, 16(pC) ++ movss rC6, 48(pC) ++ shufps $0x55, rC3, rC3 ++ shufps $0x55, rC7, rC7 ++ movss rC3, 8(pC) ++ movss rC7, 40(pC) ++ shufps $0x55, rC0, rC0 ++ shufps $0x55, rC6, rC6 ++ movss rC0, 24(pC) ++ movss rC6, 56(pC) ++ ++ movss rC11, 64(pC) ++ movhlps rC11, rC2 ++ movss rC12, 96(pC) ++ movss rC2, 80(pC) ++ shufps $0x55, rC11, rC11 ++ shufps $0x55, rC12, rC12 ++ movss rC11, 72(pC) ++ shufps $0x55, rC2, rC2 ++ movss rC12, 104(pC) ++ movss rC2, 88(pC) + +- addq $112, pC ++ addq $112, pC + #endif + /* + * Write results back to C + */ +- addq $NB14so-176, pA5 +- addq $NB14so-176, pA10 +- subq $176, pB0 ++ addq $NB14so-176, pA5 ++ addq $NB14so-176, pA10 ++ subq $176, pB0 + /* + * pC += 14; pA += 14*NB; pB -= NB; + */ + /* + * while (pA != stM); + */ +- subq $1, stM +- jne UMLOOP ++ subq $1, stM ++ jne UMLOOP + #endif + + /* +@@ -1459,994 +1466,994 @@ MLAST: + #endif + /*UKLOOP: */ + #ifdef BETA1 +- movaps 0-120(pA10,mldab5,2), rC0 +- movaps 0-120(pB0), rB0 +- mulps rB0, rC0 +- addss (pC), rC0 +- movaps 0-120(pA5, mldab,4), rC1 +- mulps rB0, rC1 +- addss CMUL(4)(pC), rC1 +- movaps 0-120(pA10, mldab,8), rC2 +- mulps rB0, rC2 +- addss CMUL(8)(pC), rC2 +- movaps 0-120(pA5, mldab,2), rC3 +- mulps rB0, rC3 +- addss CMUL(12)(pC), rC3 +- movaps 0-120(pA5, mldab), rC4 +- mulps rB0, rC4 +- addss CMUL(16)(pC), rC4 +- movaps 0-120(pA5), rC5 +- mulps rB0, rC5 +- addss CMUL(20)(pC), rC5 +- movaps 0-120(pA5, ldab), rC6 +- mulps rB0, rC6 +- addss CMUL(24)(pC), rC6 +- movaps 0-120(pA5, ldab,2), rC7 +- mulps rB0, rC7 +- addss CMUL(28)(pC), rC7 +- movaps 0-120(pA10, mldab,2), rC8 +- mulps rB0, rC8 +- addss CMUL(32)(pC), rC8 +- movaps 0-120(pA5,ldab,4), rC9 +- mulps rB0, rC9 +- addss CMUL(36)(pC), rC9 +- movaps 0-120(pA10), rC10 +- mulps rB0, rC10 +- addss CMUL(40)(pC), rC10 +- movaps 0-120(pA10,ldab), rC11 +- mulps rB0, rC11 +- addss CMUL(44)(pC), rC11 +- movaps 0-120(pA10,ldab,2), rC12 +- mulps rB0, rC12 +- addss CMUL(48)(pC), rC12 +- movaps 0-120(pA5,ldab,8), rC13 +- mulps rB0, rC13 +- addss CMUL(52)(pC), rC13 ++ movaps 0-120(pA10,mldab5,2), rC0 ++ movaps 0-120(pB0), rB0 ++ mulps rB0, rC0 ++ addss (pC), rC0 ++ movaps 0-120(pA5, mldab,4), rC1 ++ mulps rB0, rC1 ++ addss CMUL(4)(pC), rC1 ++ movaps 0-120(pA10, mldab,8), rC2 ++ mulps rB0, rC2 ++ addss CMUL(8)(pC), rC2 ++ movaps 0-120(pA5, mldab,2), rC3 ++ mulps rB0, rC3 ++ addss CMUL(12)(pC), rC3 ++ movaps 0-120(pA5, mldab), rC4 ++ mulps rB0, rC4 ++ addss CMUL(16)(pC), rC4 ++ movaps 0-120(pA5), rC5 ++ mulps rB0, rC5 ++ addss CMUL(20)(pC), rC5 ++ movaps 0-120(pA5, ldab), rC6 ++ mulps rB0, rC6 ++ addss CMUL(24)(pC), rC6 ++ movaps 0-120(pA5, ldab,2), rC7 ++ mulps rB0, rC7 ++ addss CMUL(28)(pC), rC7 ++ movaps 0-120(pA10, mldab,2), rC8 ++ mulps rB0, rC8 ++ addss CMUL(32)(pC), rC8 ++ movaps 0-120(pA5,ldab,4), rC9 ++ mulps rB0, rC9 ++ addss CMUL(36)(pC), rC9 ++ movaps 0-120(pA10), rC10 ++ mulps rB0, rC10 ++ addss CMUL(40)(pC), rC10 ++ movaps 0-120(pA10,ldab), rC11 ++ mulps rB0, rC11 ++ addss CMUL(44)(pC), rC11 ++ movaps 0-120(pA10,ldab,2), rC12 ++ mulps rB0, rC12 ++ addss CMUL(48)(pC), rC12 ++ movaps 0-120(pA5,ldab,8), rC13 ++ mulps rB0, rC13 ++ addss CMUL(52)(pC), rC13 + #else +- movaps 0-120(pA10,mldab5,2), rC0 +- movaps 0-120(pB0), rC13 +- mulps rC13, rC0 +- movaps 0-120(pA5, mldab,4), rC1 +- mulps rC13, rC1 +- movaps 0-120(pA10, mldab,8), rC2 +- mulps rC13, rC2 +- movaps 0-120(pA5, mldab,2), rC3 +- mulps rC13, rC3 +- movaps 0-120(pA5, mldab), rC4 +- mulps rC13, rC4 +- movaps 0-120(pA5), rC5 +- mulps rC13, rC5 +- movaps 0-120(pA5, ldab), rC6 +- mulps rC13, rC6 +- movaps 0-120(pA5, ldab,2), rC7 +- mulps rC13, rC7 +- movaps 0-120(pA10, mldab,2), rC8 +- mulps rC13, rC8 +- movaps 0-120(pA5,ldab,4), rC9 +- mulps rC13, rC9 +- movaps 0-120(pA10), rC10 +- mulps rC13, rC10 +- movaps 0-120(pA10,ldab), rC11 +- mulps rC13, rC11 +- movaps 0-120(pA10,ldab,2), rC12 +- mulps rC13, rC12 +- mulps 0-120(pA5,ldab,8), rC13 ++ movaps 0-120(pA10,mldab5,2), rC0 ++ movaps 0-120(pB0), rC13 ++ mulps rC13, rC0 ++ movaps 0-120(pA5, mldab,4), rC1 ++ mulps rC13, rC1 ++ movaps 0-120(pA10, mldab,8), rC2 ++ mulps rC13, rC2 ++ movaps 0-120(pA5, mldab,2), rC3 ++ mulps rC13, rC3 ++ movaps 0-120(pA5, mldab), rC4 ++ mulps rC13, rC4 ++ movaps 0-120(pA5), rC5 ++ mulps rC13, rC5 ++ movaps 0-120(pA5, ldab), rC6 ++ mulps rC13, rC6 ++ movaps 0-120(pA5, ldab,2), rC7 ++ mulps rC13, rC7 ++ movaps 0-120(pA10, mldab,2), rC8 ++ mulps rC13, rC8 ++ movaps 0-120(pA5,ldab,4), rC9 ++ mulps rC13, rC9 ++ movaps 0-120(pA10), rC10 ++ mulps rC13, rC10 ++ movaps 0-120(pA10,ldab), rC11 ++ mulps rC13, rC11 ++ movaps 0-120(pA10,ldab,2), rC12 ++ mulps rC13, rC12 ++ mulps 0-120(pA5,ldab,8), rC13 + #endif + + #if KB > 4 +- movaps 16-120(pA10,mldab5,2), rA0 +- movaps 16-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 16-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 16-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 16-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 16-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 16-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 16-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 16-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 16-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 16-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 16-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 16-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 16-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 16-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 16-120(pA10,mldab5,2), rA0 ++ movaps 16-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 16-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 16-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 16-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 16-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 16-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 16-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 16-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 16-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 16-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 16-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 16-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 16-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 16-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 8 +- movaps 32-120(pA10,mldab5,2), rA0 +- movaps 32-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 32-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 32-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 32-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 32-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 32-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 32-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 32-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 32-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 32-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 32-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 32-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 32-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 32-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 32-120(pA10,mldab5,2), rA0 ++ movaps 32-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 32-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 32-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 32-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 32-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 32-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 32-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 32-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 32-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 32-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 32-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 32-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 32-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 32-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 12 +- movaps 48-120(pA10,mldab5,2), rA0 +- movaps 48-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 48-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 48-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 48-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 48-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 48-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 48-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 48-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 48-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 48-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 48-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 48-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 48-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 48-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 48-120(pA10,mldab5,2), rA0 ++ movaps 48-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 48-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 48-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 48-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 48-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 48-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 48-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 48-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 48-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 48-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 48-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 48-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 48-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 48-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 16 +- movaps 64-120(pA10,mldab5,2), rA0 +- movaps 64-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 64-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 64-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 64-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 64-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 64-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 64-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 64-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 64-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 64-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 64-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 64-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 64-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 64-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 64-120(pA10,mldab5,2), rA0 ++ movaps 64-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 64-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 64-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 64-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 64-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 64-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 64-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 64-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 64-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 64-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 64-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 64-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 64-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 64-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 20 +- movaps 80-120(pA10,mldab5,2), rA0 +- movaps 80-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 80-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 80-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 80-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 80-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 80-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 80-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 80-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 80-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 80-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 80-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 80-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 80-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 80-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 80-120(pA10,mldab5,2), rA0 ++ movaps 80-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 80-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 80-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 80-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 80-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 80-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 80-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 80-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 80-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 80-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 80-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 80-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 80-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 80-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 24 +- movaps 96-120(pA10,mldab5,2), rA0 +- movaps 96-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 96-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 96-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 96-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 96-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 96-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 96-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 96-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 96-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 96-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 96-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 96-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 96-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 96-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 96-120(pA10,mldab5,2), rA0 ++ movaps 96-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 96-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 96-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 96-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 96-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 96-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 96-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 96-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 96-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 96-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 96-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 96-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 96-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 96-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 28 +- movaps 112-120(pA10,mldab5,2), rA0 +- movaps 112-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 112-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 112-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 112-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 112-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 112-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 112-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 112-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 112-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 112-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 112-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 112-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 112-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 112-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 112-120(pA10,mldab5,2), rA0 ++ movaps 112-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 112-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 112-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 112-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 112-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 112-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 112-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 112-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 112-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 112-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 112-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 112-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 112-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 112-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 32 +- movaps 128-120(pA10,mldab5,2), rA0 +- movaps 128-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 128-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 128-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 128-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 128-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 128-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 128-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 128-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 128-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 128-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 128-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 128-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 128-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 128-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 128-120(pA10,mldab5,2), rA0 ++ movaps 128-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 128-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 128-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 128-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 128-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 128-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 128-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 128-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 128-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 128-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 128-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 128-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 128-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 128-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 36 +- movaps 144-120(pA10,mldab5,2), rA0 +- movaps 144-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 144-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 144-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 144-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 144-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 144-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 144-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 144-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 144-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 144-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 144-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 144-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 144-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 144-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 144-120(pA10,mldab5,2), rA0 ++ movaps 144-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 144-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 144-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 144-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 144-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 144-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 144-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 144-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 144-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 144-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 144-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 144-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 144-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 144-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif +- prefB((pB,ldab)) +- prefB(64(pB,ldab)) ++ prefB((pB,ldab)) ++ prefB(64(pB,ldab)) + + #if KB > 40 +- movaps 160-120(pA10,mldab5,2), rA0 +- movaps 160-120(pB0), rB0 +- mulps rB0, rA0 +- addq $176, pB0 +- addps rA0, rC0 +- movaps 160-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 160-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 160-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 160-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 160-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 160-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 160-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 160-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 160-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 160-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 160-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 160-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addq $176, pA10 +- addps rA0, rC12 +- mulps 160-120(pA5,ldab,8), rB0 +- addps rB0, rC13 +- addq $176, pA5 ++ movaps 160-120(pA10,mldab5,2), rA0 ++ movaps 160-120(pB0), rB0 ++ mulps rB0, rA0 ++ addq $176, pB0 ++ addps rA0, rC0 ++ movaps 160-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 160-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 160-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 160-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 160-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 160-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 160-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 160-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 160-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 160-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 160-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 160-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addq $176, pA10 ++ addps rA0, rC12 ++ mulps 160-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 ++ addq $176, pA5 + #else +- addq $176, pB0 +- addq $176, pA10 +- addq $176, pA5 ++ addq $176, pB0 ++ addq $176, pA10 ++ addq $176, pA5 + #endif + + #if KB > 44 +- movaps 0-120(pA10,mldab5,2), rA0 +- movaps 0-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 0-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 0-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 0-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 0-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 0-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 0-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 0-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 0-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 0-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 0-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 0-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 0-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 0-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 0-120(pA10,mldab5,2), rA0 ++ movaps 0-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 0-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 0-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 0-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 0-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 0-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 0-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 0-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 0-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 0-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 0-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 0-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 0-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 0-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 48 +- movaps 16-120(pA10,mldab5,2), rA0 +- movaps 16-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 16-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 16-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 16-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 16-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 16-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 16-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 16-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 16-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 16-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 16-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 16-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 16-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 16-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 16-120(pA10,mldab5,2), rA0 ++ movaps 16-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 16-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 16-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 16-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 16-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 16-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 16-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 16-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 16-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 16-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 16-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 16-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 16-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 16-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 52 +- movaps 32-120(pA10,mldab5,2), rA0 +- movaps 32-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 32-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 32-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 32-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 32-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 32-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 32-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 32-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 32-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 32-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 32-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 32-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 32-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 32-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 32-120(pA10,mldab5,2), rA0 ++ movaps 32-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 32-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 32-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 32-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 32-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 32-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 32-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 32-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 32-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 32-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 32-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 32-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 32-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 32-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 56 +- movaps 48-120(pA10,mldab5,2), rA0 +- movaps 48-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 48-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 48-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 48-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 48-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 48-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 48-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 48-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 48-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 48-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 48-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 48-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 48-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 48-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 48-120(pA10,mldab5,2), rA0 ++ movaps 48-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 48-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 48-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 48-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 48-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 48-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 48-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 48-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 48-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 48-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 48-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 48-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 48-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 48-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 60 +- movaps 64-120(pA10,mldab5,2), rA0 +- movaps 64-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 64-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 64-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 64-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 64-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 64-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 64-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 64-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 64-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 64-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 64-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 64-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 64-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 64-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 64-120(pA10,mldab5,2), rA0 ++ movaps 64-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 64-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 64-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 64-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 64-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 64-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 64-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 64-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 64-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 64-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 64-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 64-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 64-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 64-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif +- prefB(128-176(pB,ldab)) +- prefB(192-176(pB,ldab)) ++ prefB(128-176(pB,ldab)) ++ prefB(192-176(pB,ldab)) + + #if KB > 64 +- movaps 80-120(pA10,mldab5,2), rA0 +- movaps 80-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 80-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 80-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 80-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 80-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 80-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 80-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 80-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 80-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 80-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 80-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 80-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 80-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 80-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 80-120(pA10,mldab5,2), rA0 ++ movaps 80-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 80-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 80-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 80-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 80-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 80-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 80-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 80-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 80-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 80-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 80-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 80-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 80-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 80-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 68 +- movaps 96-120(pA10,mldab5,2), rA0 +- movaps 96-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 96-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 96-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 96-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 96-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 96-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 96-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 96-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 96-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 96-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 96-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 96-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 96-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 96-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 96-120(pA10,mldab5,2), rA0 ++ movaps 96-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 96-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 96-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 96-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 96-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 96-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 96-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 96-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 96-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 96-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 96-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 96-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 96-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 96-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 72 +- movaps 112-120(pA10,mldab5,2), rA0 +- movaps 112-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 112-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 112-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 112-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 112-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 112-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 112-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 112-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 112-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 112-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 112-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 112-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 112-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 112-120(pA5,ldab,8), rB0 +- prefC((pC)) +- prefC((pC,incCn)) +- addps rB0, rC13 ++ movaps 112-120(pA10,mldab5,2), rA0 ++ movaps 112-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 112-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 112-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 112-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 112-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 112-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 112-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 112-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 112-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 112-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 112-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 112-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 112-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 112-120(pA5,ldab,8), rB0 ++ prefC((pC)) ++ prefC((pC,incCn)) ++ addps rB0, rC13 + #else +- prefC((pC)) +- prefC((pC,incCn)) ++ prefC((pC)) ++ prefC((pC,incCn)) + #endif + + #if KB > 76 +- movaps 128-120(pA10,mldab5,2), rA0 +- movaps 128-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 128-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 128-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 128-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 128-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 128-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 128-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 128-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 128-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 128-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 128-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 128-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 128-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 128-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 128-120(pA10,mldab5,2), rA0 ++ movaps 128-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 128-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 128-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 128-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 128-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 128-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 128-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 128-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 128-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 128-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 128-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 128-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 128-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 128-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + #if KB > 80 +- movaps 144-120(pA10,mldab5,2), rA0 +- movaps 144-120(pB0), rB0 +- mulps rB0, rA0 +- addps rA0, rC0 +- movaps 144-120(pA5, mldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC1 +- movaps 144-120(pA10, mldab,8), rA0 +- mulps rB0, rA0 +- addps rA0, rC2 +- movaps 144-120(pA5, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC3 +- movaps 144-120(pA5, mldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC4 +- movaps 144-120(pA5), rA0 +- mulps rB0, rA0 +- addps rA0, rC5 +- movaps 144-120(pA5, ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC6 +- movaps 144-120(pA5, ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC7 +- movaps 144-120(pA10, mldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC8 +- movaps 144-120(pA5,ldab,4), rA0 +- mulps rB0, rA0 +- addps rA0, rC9 +- movaps 144-120(pA10), rA0 +- mulps rB0, rA0 +- addps rA0, rC10 +- movaps 144-120(pA10,ldab), rA0 +- mulps rB0, rA0 +- addps rA0, rC11 +- movaps 144-120(pA10,ldab,2), rA0 +- mulps rB0, rA0 +- addps rA0, rC12 +- mulps 144-120(pA5,ldab,8), rB0 +- addps rB0, rC13 ++ movaps 144-120(pA10,mldab5,2), rA0 ++ movaps 144-120(pB0), rB0 ++ mulps rB0, rA0 ++ addps rA0, rC0 ++ movaps 144-120(pA5, mldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC1 ++ movaps 144-120(pA10, mldab,8), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC2 ++ movaps 144-120(pA5, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC3 ++ movaps 144-120(pA5, mldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC4 ++ movaps 144-120(pA5), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC5 ++ movaps 144-120(pA5, ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC6 ++ movaps 144-120(pA5, ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC7 ++ movaps 144-120(pA10, mldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC8 ++ movaps 144-120(pA5,ldab,4), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC9 ++ movaps 144-120(pA10), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC10 ++ movaps 144-120(pA10,ldab), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC11 ++ movaps 144-120(pA10,ldab,2), rA0 ++ mulps rB0, rA0 ++ addps rA0, rC12 ++ mulps 144-120(pA5,ldab,8), rB0 ++ addps rB0, rC13 + #endif + + /*UKLOOP */ +@@ -2454,202 +2461,202 @@ MLAST: + * Get these bastard things summed up correctly + */ + +- /* rC0 = c0a c0b c0c c0d */ +- /* rC1 = c1a c1b c1c c1d */ +- /* rC2 = c2a c2b c2c c2d */ +- /* rC3 = c3a c3b c3c c3d */ ++ /* rC0 = c0a c0b c0c c0d */ ++ /* rC1 = c1a c1b c1c c1d */ ++ /* rC2 = c2a c2b c2c c2d */ ++ /* rC3 = c3a c3b c3c c3d */ + /* */ +- movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */ +- prefC(64(pC,incCn)) +- prefB(256-176(pB,ldab)) +- movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */ +- unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */ +- unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */ +- unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */ +- movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */ +- unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */ +- movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */ +- movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */ +- movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */ +- addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */ +- movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */ +- movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */ +- movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */ +- addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */ +- movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */ +- addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */ +- +- +- /* rC4 = c4a c4b c4c c4d */ +- /* rC5 = c5a c5b c5c c5d */ +- /* rC6 = c6a c6b c6c c6d */ +- /* rC7 = c7a c7b c7c c7d */ +- /* rC8 = c08a c08b c08c c08d */ +- /* rC9 = c09a c09b c09c c09d */ +- /* rC10 = c10a c10b c10c c10d */ +- /* rC11 = c11a c11b c11c c11d */ +- /* rC12 = c12a c12b c12c c12d */ +- /* rC13 = c13a c13b c13c c13d */ ++ movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */ ++ prefC(64(pC,incCn)) ++ prefB(256-176(pB,ldab)) ++ movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */ ++ unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */ ++ unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */ ++ unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */ ++ movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */ ++ unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */ ++ movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */ ++ movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */ ++ movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */ ++ addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */ ++ movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */ ++ movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */ ++ movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */ ++ addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */ ++ movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */ ++ addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */ ++ ++ ++ /* rC4 = c4a c4b c4c c4d */ ++ /* rC5 = c5a c5b c5c c5d */ ++ /* rC6 = c6a c6b c6c c6d */ ++ /* rC7 = c7a c7b c7c c7d */ ++ /* rC8 = c08a c08b c08c c08d */ ++ /* rC9 = c09a c09b c09c c09d */ ++ /* rC10 = c10a c10b c10c c10d */ ++ /* rC11 = c11a c11b c11c c11d */ ++ /* rC12 = c12a c12b c12c c12d */ ++ /* rC13 = c13a c13b c13c c13d */ + /* */ +- movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */ +- movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */ +- movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */ +- unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */ +- unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */ +- unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */ +- unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */ +- unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */ +- movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */ +- unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */ +- movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */ +- movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */ +- unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */ +- movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */ +- movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */ +- addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */ ++ movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */ ++ movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */ ++ movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */ ++ unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */ ++ unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */ ++ unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */ ++ unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */ ++ unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */ ++ movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */ ++ unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */ ++ movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */ ++ movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */ ++ unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */ ++ movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */ ++ movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */ ++ addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */ + #ifdef BETAX + #ifdef SREAL +- movups (pC), rA0 +- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ +- movups 16(pC), rC4 +- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ +- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ +- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ +- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ +- movups 32(pC), rC5 +- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ +- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ +- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ +- movlps 48(pC), rC1 +- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ +- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ +- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ +- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ +- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ +- mulps BOF(%rsp), rA0 +- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ +- mulps BOF(%rsp), rC4 +- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ +- mulps BOF(%rsp), rC5 +- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ +- mulps BOF(%rsp), rC1 ++ movups (pC), rA0 ++ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ ++ movups 16(pC), rC4 ++ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ ++ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ ++ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ ++ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ ++ movups 32(pC), rC5 ++ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ ++ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ ++ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ ++ movlps 48(pC), rC1 ++ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ ++ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ ++ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ ++ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ ++ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ ++ mulps BOF(%rsp), rA0 ++ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ ++ mulps BOF(%rsp), rC4 ++ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ ++ mulps BOF(%rsp), rC5 ++ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ ++ mulps BOF(%rsp), rC1 + + /* */ + +- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ +- addps rA0, rC3 +- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ +- addps rC4, rC7 +- addps rC5, rC11 +- prefB(320-176(pB,ldab)) +- addps rC1, rC12 ++ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ ++ addps rA0, rC3 ++ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ ++ addps rC4, rC7 ++ addps rC5, rC11 ++ prefB(320-176(pB,ldab)) ++ addps rC1, rC12 + #else /* BETA = X, complex type */ +- movups (pC), rA0 +- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ +- movups 16(pC), rC4 +- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ +- shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */ +- movups 32(pC), rC4 /* rC4 = c4 X c5 X */ +- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ +- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ +- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ +- movups 48(pC), rC5 /* rC5 = c6 X c7 X */ +- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ +- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ +- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ +- shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */ +- movups 64(pC), rC5 /* rC5 = c8 X c9 X */ +- movups 80(pC), rC1 /* rC1 = c10 X c11 X */ +- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ +- shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */ +- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ +- movss 96(pC), rC1 +- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ +- movss 104(pC), rB0 +- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ +- unpcklps rB0, rC1 +- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ +- mulps BOF(%rsp), rA0 +- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ +- mulps BOF(%rsp), rC4 +- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ +- mulps BOF(%rsp), rC5 +- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ +- mulps BOF(%rsp), rC1 ++ movups (pC), rA0 ++ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ ++ movups 16(pC), rC4 ++ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ ++ shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */ ++ movups 32(pC), rC4 /* rC4 = c4 X c5 X */ ++ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ ++ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ ++ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ ++ movups 48(pC), rC5 /* rC5 = c6 X c7 X */ ++ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ ++ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ ++ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ ++ shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */ ++ movups 64(pC), rC5 /* rC5 = c8 X c9 X */ ++ movups 80(pC), rC1 /* rC1 = c10 X c11 X */ ++ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ ++ shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */ ++ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ ++ movss 96(pC), rC1 ++ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ ++ movss 104(pC), rB0 ++ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ ++ unpcklps rB0, rC1 ++ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ ++ mulps BOF(%rsp), rA0 ++ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ ++ mulps BOF(%rsp), rC4 ++ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ ++ mulps BOF(%rsp), rC5 ++ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ ++ mulps BOF(%rsp), rC1 + + /* */ + +- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ +- addps rA0, rC3 +- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ +- addps rC4, rC7 +- addps rC5, rC11 +- prefB(320-176(pB,ldab)) +- addps rC1, rC12 ++ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ ++ addps rA0, rC3 ++ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ ++ addps rC4, rC7 ++ addps rC5, rC11 ++ prefB(320-176(pB,ldab)) ++ addps rC1, rC12 + #endif + + #else +- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ +- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ +- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ +- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ +- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ +- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ +- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ +- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ +- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ +- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ +- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ +- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ +- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ +- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ +- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ +- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ ++ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ ++ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ ++ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ ++ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ ++ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ ++ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ ++ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ ++ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ ++ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ ++ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ ++ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ ++ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ ++ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ ++ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ ++ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ ++ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ + + /* */ + +- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ +- prefB(320-176(pB,ldab)) +- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ ++ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ ++ prefB(320-176(pB,ldab)) ++ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ + + #endif + /* + * Write results back to C; pC += 14; + */ + #ifdef SREAL +- movups rC3, (pC) +- movups rC7, 16(pC) +- movups rC11, 32(pC) +- movlps rC12, 48(pC) +-/* addq $56, pC */ ++ movups rC3, (pC) ++ movups rC7, 16(pC) ++ movups rC11, 32(pC) ++ movlps rC12, 48(pC) ++/* addq $56, pC */ + #else +- movss rC3, (pC) +- movss rC7, 32(pC) +- movhlps rC3, rC0 +- movhlps rC7, rC6 +- movss rC0, 16(pC) +- movss rC6, 48(pC) +- shufps $0x55, rC3, rC3 +- shufps $0x55, rC7, rC7 +- movss rC3, 8(pC) +- movss rC7, 40(pC) +- shufps $0x55, rC0, rC0 +- shufps $0x55, rC6, rC6 +- movss rC0, 24(pC) +- movss rC6, 56(pC) +- +- movss rC11, 64(pC) +- movhlps rC11, rC2 +- movss rC12, 96(pC) +- movss rC2, 80(pC) +- shufps $0x55, rC11, rC11 +- shufps $0x55, rC12, rC12 +- movss rC11, 72(pC) +- shufps $0x55, rC2, rC2 +- movss rC12, 104(pC) +- movss rC2, 88(pC) ++ movss rC3, (pC) ++ movss rC7, 32(pC) ++ movhlps rC3, rC0 ++ movhlps rC7, rC6 ++ movss rC0, 16(pC) ++ movss rC6, 48(pC) ++ shufps $0x55, rC3, rC3 ++ shufps $0x55, rC7, rC7 ++ movss rC3, 8(pC) ++ movss rC7, 40(pC) ++ shufps $0x55, rC0, rC0 ++ shufps $0x55, rC6, rC6 ++ movss rC0, 24(pC) ++ movss rC6, 56(pC) ++ ++ movss rC11, 64(pC) ++ movhlps rC11, rC2 ++ movss rC12, 96(pC) ++ movss rC2, 80(pC) ++ shufps $0x55, rC11, rC11 ++ shufps $0x55, rC12, rC12 ++ movss rC11, 72(pC) ++ shufps $0x55, rC2, rC2 ++ movss rC12, 104(pC) ++ movss rC2, 88(pC) + +-/* addq $112, pC */ ++/* addq $112, pC */ + #endif + /* + * Write results back to C +@@ -2660,55 +2667,55 @@ MLAST: + /* + * while (pA != stM); + */ +-/* subq $1, stM */ +-/* jne UMLOOP */ ++/* subq $1, stM */ ++/* jne UMLOOP */ + /* + * pC += 14; pA += 14*NB; pB -= NB; + */ +-/* subq $MBKBso-NB14so+176, pA5 */ +-/* subq $MBKBso-NB14so+176, pA10 */ +- subq incAm, pA5 +- subq incAm, pA10 +- addq $NBso-176, pB0 ++/* subq $MBKBso-NB14so+176, pA5 */ ++/* subq $MBKBso-NB14so+176, pA10 */ ++ subq incAm, pA5 ++ subq incAm, pA10 ++ addq $NBso-176, pB0 + /* + * while (pA != stM); + */ +-/* subq $1, stM */ +-/* jne UMLOOP */ ++/* subq $1, stM */ ++/* jne UMLOOP */ + /* + * pC += incCn; pA -= NBNB; pB += NB; + */ +- addq incCn, pC ++ addq incCn, pC + /* + * while (pB != stN); + */ +- sub $1, stN +- jne UNLOOP ++ sub $1, stN ++ jne UNLOOP + + /* + * Restore callee-saved iregs + */ + DONE: +- movq -8(%rsp), %rbp +- movq -16(%rsp), %rbx ++ movq -8(%rsp), %rbp ++ movq -16(%rsp), %rbx + #if MB == 0 +- movq -32(%rsp), %r12 +- movq -40(%rsp), %r13 ++ movq -32(%rsp), %r12 ++ movq -40(%rsp), %r13 + #endif +- ret ++ ret + #if MB == 0 + MB_LT84: +- cmp $70, stM +- jne MB_LT70 +-/* movq $70/14, stM */ +- movq $5, stM +- jmp MBFOUND ++ cmp $70, stM ++ jne MB_LT70 ++/* movq $70/14, stM */ ++ movq $5, stM ++ jmp MBFOUND + MB_LT70: +- cmp $56, stM +- jne MB_LT56 +-/* movq $56/14, stM */ +- movq $4, stM +- jmp MBFOUND ++ cmp $56, stM ++ jne MB_LT56 ++/* movq $56/14, stM */ ++ movq $4, stM ++ jmp MBFOUND + MB_LT56: + cmp $42, stM + jne MB_LT42 +diff -rupN ATLAS/tune/blas/level1/scalsrch.c atlas-3.8.3/tune/blas/level1/scalsrch.c +--- ATLAS/tune/blas/level1/scalsrch.c 2009-02-18 19:48:25.000000000 +0100 ++++ atlas-3.8.3/tune/blas/level1/scalsrch.c 2009-11-12 13:45:48.141174024 +0100 +@@ -747,7 +747,7 @@ void GenMainRout(char pre, int n, int *i + /* + * Handle all special alpha cases + */ +- fprintf(fpout, "%sif ( SCALAR_IS_ZERO(alpha) )\n", spc); ++ /* fprintf(fpout, "%sif ( SCALAR_IS_ZERO(alpha) )\n", spc); + fprintf(fpout, "%s{\n", spc); + if (pre == 'c' || pre == 'z') + { +@@ -756,7 +756,7 @@ void GenMainRout(char pre, int n, int *i + } + else fprintf(fpout, "%s Mjoin(PATL,set)(N, ATL_rzero, X, incx);\n", spc); + fprintf(fpout, "%s return;\n", spc); +- fprintf(fpout, "%s}\n", spc); ++ fprintf(fpout, "%s}\n", spc); */ + GenAlphCase(pre, spc, fpout, 1, n, ix, iy, ia, ib); + GenAlphCase(pre, spc, fpout, -1, n, ix, iy, ia, ib); + if (pre == 'c' || pre == 'z') |