diff options
Diffstat (limited to 'src/field_5x52_asm.asm')
-rw-r--r-- | src/field_5x52_asm.asm | 469 |
1 files changed, 469 insertions, 0 deletions
diff --git a/src/field_5x52_asm.asm b/src/field_5x52_asm.asm new file mode 100644 index 0000000000..5e785f7630 --- /dev/null +++ b/src/field_5x52_asm.asm @@ -0,0 +1,469 @@ + ;; Added by Diederik Huys, March 2013 + ;; + ;; Provided public procedures: + ;; secp256k1_fe_mul_inner + ;; secp256k1_fe_sqr_inner + ;; + ;; Needed tools: YASM (http://yasm.tortall.net) + ;; + ;; + + BITS 64 + +%ifidn __OUTPUT_FORMAT__,macho64 +%define SYM(x) _ %+ x +%else +%define SYM(x) x +%endif + + ;; Procedure ExSetMult + ;; Register Layout: + ;; INPUT: rdi = a->n + ;; rsi = b->n + ;; rdx = r->a + ;; + ;; INTERNAL: rdx:rax = multiplication accumulator + ;; r9:r8 = c + ;; r10-r13 = t0-t3 + ;; r14 = b.n[0] / t4 + ;; r15 = b.n[1] / t5 + ;; rbx = b.n[2] / t6 + ;; rcx = b.n[3] / t7 + ;; rbp = Constant 0FFFFFFFFFFFFFh / t8 + ;; rsi = b.n / b.n[4] / t9 + + GLOBAL SYM(secp256k1_fe_mul_inner) + ALIGN 32 +SYM(secp256k1_fe_mul_inner): + push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + push rdx + mov r14,[rsi+8*0] ; preload b.n[0]. This will be the case until + ; b.n[0] is no longer needed, then we reassign + ; r14 to t4 + ;; c=a.n[0] * b.n[0] + mov rax,[rdi+0*8] ; load a.n[0] + mov rbp,0FFFFFFFFFFFFFh + mul r14 ; rdx:rax=a.n[0]*b.n[0] + mov r15,[rsi+1*8] + mov r10,rbp ; load modulus into target register for t0 + mov r8,rax + and r10,rax ; only need lower qword of c + shrd r8,rdx,52 + xor r9,r9 ; c < 2^64, so we ditch the HO part + + ;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0] + mov rax,[rdi+0*8] + mul r15 + add r8,rax + adc r9,rdx + + mov rax,[rdi+1*8] + mul r14 + mov r11,rbp + mov rbx,[rsi+2*8] + add r8,rax + adc r9,rdx + and r11,r8 + shrd r8,r9,52 + xor r9,r9 + + ;; c+=a.n[0 1 2] * b.n[2 1 0] + mov rax,[rdi+0*8] + mul rbx + add r8,rax + adc r9,rdx + + mov rax,[rdi+1*8] + mul r15 + add r8,rax + adc r9,rdx + + mov rax,[rdi+2*8] + mul r14 + mov r12,rbp + mov rcx,[rsi+3*8] + add r8,rax + adc r9,rdx + and r12,r8 + shrd r8,r9,52 + xor r9,r9 + + ;; c+=a.n[0 1 2 3] * b.n[3 2 1 0] + mov rax,[rdi+0*8] + mul rcx + add r8,rax + adc r9,rdx + + mov rax,[rdi+1*8] + mul rbx + add r8,rax + adc r9,rdx + + mov rax,[rdi+2*8] + mul r15 + add r8,rax + adc r9,rdx + + mov rax,[rdi+3*8] + mul r14 + mov r13,rbp + mov rsi,[rsi+4*8] ; load b.n[4] and destroy pointer + add r8,rax + adc r9,rdx + and r13,r8 + + shrd r8,r9,52 + xor r9,r9 + + + ;; c+=a.n[0 1 2 3 4] * b.n[4 3 2 1 0] + mov rax,[rdi+0*8] + mul rsi + add r8,rax + adc r9,rdx + + mov rax,[rdi+1*8] + mul rcx + add r8,rax + adc r9,rdx + + mov rax,[rdi+2*8] + mul rbx + add r8,rax + adc r9,rdx + + mov rax,[rdi+3*8] + mul r15 + add r8,rax + adc r9,rdx + + mov rax,[rdi+4*8] + mul r14 + mov r14,rbp ; load modulus into t4 and destroy a.n[0] + add r8,rax + adc r9,rdx + and r14,r8 + shrd r8,r9,52 + xor r9,r9 + + ;; c+=a.n[1 2 3 4] * b.n[4 3 2 1] + mov rax,[rdi+1*8] + mul rsi + add r8,rax + adc r9,rdx + + mov rax,[rdi+2*8] + mul rcx + add r8,rax + adc r9,rdx + + mov rax,[rdi+3*8] + mul rbx + add r8,rax + adc r9,rdx + + mov rax,[rdi+4*8] + mul r15 + mov r15,rbp + add r8,rax + adc r9,rdx + + and r15,r8 + shrd r8,r9,52 + xor r9,r9 + + ;; c+=a.n[2 3 4] * b.n[4 3 2] + mov rax,[rdi+2*8] + mul rsi + add r8,rax + adc r9,rdx + + mov rax,[rdi+3*8] + mul rcx + add r8,rax + adc r9,rdx + + mov rax,[rdi+4*8] + mul rbx + mov rbx,rbp + add r8,rax + adc r9,rdx + + and rbx,r8 + shrd r8,r9,52 + xor r9,r9 + + ;; c+=a.n[3 4] * b.n[4 3] + mov rax,[rdi+3*8] + mul rsi + add r8,rax + adc r9,rdx + + mov rax,[rdi+4*8] + mul rcx + mov rcx,rbp + add r8,rax + adc r9,rdx + and rcx,r8 + shrd r8,r9,52 + xor r9,r9 + + ;; c+=a.n[4] * b.n[4] + mov rax,[rdi+4*8] + mul rsi + ;; mov rbp,rbp ; modulus already there! + add r8,rax + adc r9,rdx + and rbp,r8 + shrd r8,r9,52 + xor r9,r9 + + mov rsi,r8 ; load c into t9 and destroy b.n[4] + + ;; ******************************************************* +common_exit_norm: + mov rdi,01000003D10h ; load constant + + mov rax,r15 ; get t5 + mul rdi + add rax,r10 ; +t0 + adc rdx,0 + mov r10,0FFFFFFFFFFFFFh ; modulus. Sadly, we ran out of registers! + mov r8,rax ; +c + and r10,rax + shrd r8,rdx,52 + xor r9,r9 + + mov rax,rbx ; get t6 + mul rdi + add rax,r11 ; +t1 + adc rdx,0 + mov r11,0FFFFFFFFFFFFFh ; modulus + add r8,rax ; +c + adc r9,rdx + and r11,r8 + shrd r8,r9,52 + xor r9,r9 + + mov rax,rcx ; get t7 + mul rdi + add rax,r12 ; +t2 + adc rdx,0 + pop rbx ; retrieve pointer to this.n + mov r12,0FFFFFFFFFFFFFh ; modulus + add r8,rax ; +c + adc r9,rdx + and r12,r8 + mov [rbx+2*8],r12 ; mov into this.n[2] + shrd r8,r9,52 + xor r9,r9 + + mov rax,rbp ; get t8 + mul rdi + add rax,r13 ; +t3 + adc rdx,0 + mov r13,0FFFFFFFFFFFFFh ; modulus + add r8,rax ; +c + adc r9,rdx + and r13,r8 + mov [rbx+3*8],r13 ; -> this.n[3] + shrd r8,r9,52 + xor r9,r9 + + mov rax,rsi ; get t9 + mul rdi + add rax,r14 ; +t4 + adc rdx,0 + mov r14,0FFFFFFFFFFFFh ; !!! + add r8,rax ; +c + adc r9,rdx + and r14,r8 + mov [rbx+4*8],r14 ; -> this.n[4] + shrd r8,r9,48 ; !!! + xor r9,r9 + + mov rax,01000003D1h + mul r8 + add rax,r10 + adc rdx,0 + mov r10,0FFFFFFFFFFFFFh ; modulus + mov r8,rax + and rax,r10 + shrd r8,rdx,52 + mov [rbx+0*8],rax ; -> this.n[0] + add r8,r11 + mov [rbx+1*8],r8 ; -> this.n[1] + + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + pop rbp + ret + + + ;; PROC ExSetSquare + ;; Register Layout: + ;; INPUT: rdi = a.n + ;; rsi = this.a + ;; INTERNAL: rdx:rax = multiplication accumulator + ;; r9:r8 = c + ;; r10-r13 = t0-t3 + ;; r14 = a.n[0] / t4 + ;; r15 = a.n[1] / t5 + ;; rbx = a.n[2] / t6 + ;; rcx = a.n[3] / t7 + ;; rbp = 0FFFFFFFFFFFFFh / t8 + ;; rsi = a.n[4] / t9 + GLOBAL SYM(secp256k1_fe_sqr_inner) + ALIGN 32 +SYM(secp256k1_fe_sqr_inner): + push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + push rsi + mov rbp,0FFFFFFFFFFFFFh + + ;; c=a.n[0] * a.n[0] + mov r14,[rdi+0*8] ; r14=a.n[0] + mov r10,rbp ; modulus + mov rax,r14 + mul rax + mov r15,[rdi+1*8] ; a.n[1] + add r14,r14 ; r14=2*a.n[0] + mov r8,rax + and r10,rax ; only need lower qword + shrd r8,rdx,52 + xor r9,r9 + + ;; c+=2*a.n[0] * a.n[1] + mov rax,r14 ; r14=2*a.n[0] + mul r15 + mov rbx,[rdi+2*8] ; rbx=a.n[2] + mov r11,rbp ; modulus + add r8,rax + adc r9,rdx + and r11,r8 + shrd r8,r9,52 + xor r9,r9 + + ;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1] + mov rax,r14 + mul rbx + add r8,rax + adc r9,rdx + + mov rax,r15 + mov r12,rbp ; modulus + mul rax + mov rcx,[rdi+3*8] ; rcx=a.n[3] + add r15,r15 ; r15=a.n[1]*2 + add r8,rax + adc r9,rdx + and r12,r8 ; only need lower dword + shrd r8,r9,52 + xor r9,r9 + + ;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2] + mov rax,r14 + mul rcx + add r8,rax + adc r9,rdx + + mov rax,r15 ; rax=2*a.n[1] + mov r13,rbp ; modulus + mul rbx + mov rsi,[rdi+4*8] ; rsi=a.n[4] + add r8,rax + adc r9,rdx + and r13,r8 + shrd r8,r9,52 + xor r9,r9 + + ;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2] + mov rax,r14 ; last time we need 2*a.n[0] + mul rsi + add r8,rax + adc r9,rdx + + mov rax,r15 + mul rcx + mov r14,rbp ; modulus + add r8,rax + adc r9,rdx + + mov rax,rbx + mul rax + add rbx,rbx ; rcx=2*a.n[2] + add r8,rax + adc r9,rdx + and r14,r8 + shrd r8,r9,52 + xor r9,r9 + + ;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3] + mov rax,r15 ; last time we need 2*a.n[1] + mul rsi + add r8,rax + adc r9,rdx + + mov rax,rbx + mul rcx + mov r15,rbp ; modulus + add r8,rax + adc r9,rdx + and r15,r8 + shrd r8,r9,52 + xor r9,r9 + + ;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3] + mov rax,rbx ; last time we need 2*a.n[2] + mul rsi + add r8,rax + adc r9,rdx + + mov rax,rcx ; a.n[3] + mul rax + mov rbx,rbp ; modulus + add r8,rax + adc r9,rdx + and rbx,r8 ; only need lower dword + lea rax,[2*rcx] + shrd r8,r9,52 + xor r9,r9 + + ;; c+=2*a.n[3]*a.n[4] + mul rsi + mov rcx,rbp ; modulus + add r8,rax + adc r9,rdx + and rcx,r8 ; only need lower dword + shrd r8,r9,52 + xor r9,r9 + + ;; c+=a.n[4]*a.n[4] + mov rax,rsi + mul rax + ;; mov rbp,rbp ; modulus is already there! + add r8,rax + adc r9,rdx + and rbp,r8 + shrd r8,r9,52 + xor r9,r9 + + mov rsi,r8 + + ;; ******************************************************* + jmp common_exit_norm + end + + |