aboutsummaryrefslogtreecommitdiff
path: root/src/field_5x52_asm.asm
diff options
context:
space:
mode:
Diffstat (limited to 'src/field_5x52_asm.asm')
-rw-r--r--src/field_5x52_asm.asm469
1 files changed, 469 insertions, 0 deletions
diff --git a/src/field_5x52_asm.asm b/src/field_5x52_asm.asm
new file mode 100644
index 0000000000..5e785f7630
--- /dev/null
+++ b/src/field_5x52_asm.asm
@@ -0,0 +1,469 @@
+ ;; Added by Diederik Huys, March 2013
+ ;;
+ ;; Provided public procedures:
+ ;; secp256k1_fe_mul_inner
+ ;; secp256k1_fe_sqr_inner
+ ;;
+ ;; Needed tools: YASM (http://yasm.tortall.net)
+ ;;
+ ;;
+
+ BITS 64
+
+%ifidn __OUTPUT_FORMAT__,macho64
+%define SYM(x) _ %+ x
+%else
+%define SYM(x) x
+%endif
+
+ ;; Procedure ExSetMult
+ ;; Register Layout:
+ ;; INPUT: rdi = a->n
+ ;; rsi = b->n
+ ;; rdx = r->a
+ ;;
+ ;; INTERNAL: rdx:rax = multiplication accumulator
+ ;; r9:r8 = c
+ ;; r10-r13 = t0-t3
+ ;; r14 = b.n[0] / t4
+ ;; r15 = b.n[1] / t5
+ ;; rbx = b.n[2] / t6
+ ;; rcx = b.n[3] / t7
+ ;; rbp = Constant 0FFFFFFFFFFFFFh / t8
+ ;; rsi = b.n / b.n[4] / t9
+
+ GLOBAL SYM(secp256k1_fe_mul_inner)
+ ALIGN 32
+SYM(secp256k1_fe_mul_inner):
+ push rbp
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdx
+ mov r14,[rsi+8*0] ; preload b.n[0]. This will be the case until
+ ; b.n[0] is no longer needed, then we reassign
+ ; r14 to t4
+ ;; c=a.n[0] * b.n[0]
+ mov rax,[rdi+0*8] ; load a.n[0]
+ mov rbp,0FFFFFFFFFFFFFh
+ mul r14 ; rdx:rax=a.n[0]*b.n[0]
+ mov r15,[rsi+1*8]
+ mov r10,rbp ; load modulus into target register for t0
+ mov r8,rax
+ and r10,rax ; only need lower qword of c
+ shrd r8,rdx,52
+ xor r9,r9 ; c < 2^64, so we ditch the HO part
+
+ ;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0]
+ mov rax,[rdi+0*8]
+ mul r15
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,[rdi+1*8]
+ mul r14
+ mov r11,rbp
+ mov rbx,[rsi+2*8]
+ add r8,rax
+ adc r9,rdx
+ and r11,r8
+ shrd r8,r9,52
+ xor r9,r9
+
+ ;; c+=a.n[0 1 2] * b.n[2 1 0]
+ mov rax,[rdi+0*8]
+ mul rbx
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,[rdi+1*8]
+ mul r15
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,[rdi+2*8]
+ mul r14
+ mov r12,rbp
+ mov rcx,[rsi+3*8]
+ add r8,rax
+ adc r9,rdx
+ and r12,r8
+ shrd r8,r9,52
+ xor r9,r9
+
+ ;; c+=a.n[0 1 2 3] * b.n[3 2 1 0]
+ mov rax,[rdi+0*8]
+ mul rcx
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,[rdi+1*8]
+ mul rbx
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,[rdi+2*8]
+ mul r15
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,[rdi+3*8]
+ mul r14
+ mov r13,rbp
+ mov rsi,[rsi+4*8] ; load b.n[4] and destroy pointer
+ add r8,rax
+ adc r9,rdx
+ and r13,r8
+
+ shrd r8,r9,52
+ xor r9,r9
+
+
+ ;; c+=a.n[0 1 2 3 4] * b.n[4 3 2 1 0]
+ mov rax,[rdi+0*8]
+ mul rsi
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,[rdi+1*8]
+ mul rcx
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,[rdi+2*8]
+ mul rbx
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,[rdi+3*8]
+ mul r15
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,[rdi+4*8]
+ mul r14
+ mov r14,rbp ; load modulus into t4 and destroy a.n[0]
+ add r8,rax
+ adc r9,rdx
+ and r14,r8
+ shrd r8,r9,52
+ xor r9,r9
+
+ ;; c+=a.n[1 2 3 4] * b.n[4 3 2 1]
+ mov rax,[rdi+1*8]
+ mul rsi
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,[rdi+2*8]
+ mul rcx
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,[rdi+3*8]
+ mul rbx
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,[rdi+4*8]
+ mul r15
+ mov r15,rbp
+ add r8,rax
+ adc r9,rdx
+
+ and r15,r8
+ shrd r8,r9,52
+ xor r9,r9
+
+ ;; c+=a.n[2 3 4] * b.n[4 3 2]
+ mov rax,[rdi+2*8]
+ mul rsi
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,[rdi+3*8]
+ mul rcx
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,[rdi+4*8]
+ mul rbx
+ mov rbx,rbp
+ add r8,rax
+ adc r9,rdx
+
+ and rbx,r8
+ shrd r8,r9,52
+ xor r9,r9
+
+ ;; c+=a.n[3 4] * b.n[4 3]
+ mov rax,[rdi+3*8]
+ mul rsi
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,[rdi+4*8]
+ mul rcx
+ mov rcx,rbp
+ add r8,rax
+ adc r9,rdx
+ and rcx,r8
+ shrd r8,r9,52
+ xor r9,r9
+
+ ;; c+=a.n[4] * b.n[4]
+ mov rax,[rdi+4*8]
+ mul rsi
+ ;; mov rbp,rbp ; modulus already there!
+ add r8,rax
+ adc r9,rdx
+ and rbp,r8
+ shrd r8,r9,52
+ xor r9,r9
+
+ mov rsi,r8 ; load c into t9 and destroy b.n[4]
+
+ ;; *******************************************************
+common_exit_norm:
+ mov rdi,01000003D10h ; load constant
+
+ mov rax,r15 ; get t5
+ mul rdi
+ add rax,r10 ; +t0
+ adc rdx,0
+ mov r10,0FFFFFFFFFFFFFh ; modulus. Sadly, we ran out of registers!
+ mov r8,rax ; +c
+ and r10,rax
+ shrd r8,rdx,52
+ xor r9,r9
+
+ mov rax,rbx ; get t6
+ mul rdi
+ add rax,r11 ; +t1
+ adc rdx,0
+ mov r11,0FFFFFFFFFFFFFh ; modulus
+ add r8,rax ; +c
+ adc r9,rdx
+ and r11,r8
+ shrd r8,r9,52
+ xor r9,r9
+
+ mov rax,rcx ; get t7
+ mul rdi
+ add rax,r12 ; +t2
+ adc rdx,0
+ pop rbx ; retrieve pointer to this.n
+ mov r12,0FFFFFFFFFFFFFh ; modulus
+ add r8,rax ; +c
+ adc r9,rdx
+ and r12,r8
+ mov [rbx+2*8],r12 ; mov into this.n[2]
+ shrd r8,r9,52
+ xor r9,r9
+
+ mov rax,rbp ; get t8
+ mul rdi
+ add rax,r13 ; +t3
+ adc rdx,0
+ mov r13,0FFFFFFFFFFFFFh ; modulus
+ add r8,rax ; +c
+ adc r9,rdx
+ and r13,r8
+ mov [rbx+3*8],r13 ; -> this.n[3]
+ shrd r8,r9,52
+ xor r9,r9
+
+ mov rax,rsi ; get t9
+ mul rdi
+ add rax,r14 ; +t4
+ adc rdx,0
+ mov r14,0FFFFFFFFFFFFh ; !!!
+ add r8,rax ; +c
+ adc r9,rdx
+ and r14,r8
+ mov [rbx+4*8],r14 ; -> this.n[4]
+ shrd r8,r9,48 ; !!!
+ xor r9,r9
+
+ mov rax,01000003D1h
+ mul r8
+ add rax,r10
+ adc rdx,0
+ mov r10,0FFFFFFFFFFFFFh ; modulus
+ mov r8,rax
+ and rax,r10
+ shrd r8,rdx,52
+ mov [rbx+0*8],rax ; -> this.n[0]
+ add r8,r11
+ mov [rbx+1*8],r8 ; -> this.n[1]
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ pop rbp
+ ret
+
+
+ ;; PROC ExSetSquare
+ ;; Register Layout:
+ ;; INPUT: rdi = a.n
+ ;; rsi = this.a
+ ;; INTERNAL: rdx:rax = multiplication accumulator
+ ;; r9:r8 = c
+ ;; r10-r13 = t0-t3
+ ;; r14 = a.n[0] / t4
+ ;; r15 = a.n[1] / t5
+ ;; rbx = a.n[2] / t6
+ ;; rcx = a.n[3] / t7
+ ;; rbp = 0FFFFFFFFFFFFFh / t8
+ ;; rsi = a.n[4] / t9
+ GLOBAL SYM(secp256k1_fe_sqr_inner)
+ ALIGN 32
+SYM(secp256k1_fe_sqr_inner):
+ push rbp
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ push rsi
+ mov rbp,0FFFFFFFFFFFFFh
+
+ ;; c=a.n[0] * a.n[0]
+ mov r14,[rdi+0*8] ; r14=a.n[0]
+ mov r10,rbp ; modulus
+ mov rax,r14
+ mul rax
+ mov r15,[rdi+1*8] ; a.n[1]
+ add r14,r14 ; r14=2*a.n[0]
+ mov r8,rax
+ and r10,rax ; only need lower qword
+ shrd r8,rdx,52
+ xor r9,r9
+
+ ;; c+=2*a.n[0] * a.n[1]
+ mov rax,r14 ; r14=2*a.n[0]
+ mul r15
+ mov rbx,[rdi+2*8] ; rbx=a.n[2]
+ mov r11,rbp ; modulus
+ add r8,rax
+ adc r9,rdx
+ and r11,r8
+ shrd r8,r9,52
+ xor r9,r9
+
+ ;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1]
+ mov rax,r14
+ mul rbx
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,r15
+ mov r12,rbp ; modulus
+ mul rax
+ mov rcx,[rdi+3*8] ; rcx=a.n[3]
+ add r15,r15 ; r15=a.n[1]*2
+ add r8,rax
+ adc r9,rdx
+ and r12,r8 ; only need lower dword
+ shrd r8,r9,52
+ xor r9,r9
+
+ ;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2]
+ mov rax,r14
+ mul rcx
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,r15 ; rax=2*a.n[1]
+ mov r13,rbp ; modulus
+ mul rbx
+ mov rsi,[rdi+4*8] ; rsi=a.n[4]
+ add r8,rax
+ adc r9,rdx
+ and r13,r8
+ shrd r8,r9,52
+ xor r9,r9
+
+ ;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2]
+ mov rax,r14 ; last time we need 2*a.n[0]
+ mul rsi
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,r15
+ mul rcx
+ mov r14,rbp ; modulus
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,rbx
+ mul rax
+ add rbx,rbx ; rcx=2*a.n[2]
+ add r8,rax
+ adc r9,rdx
+ and r14,r8
+ shrd r8,r9,52
+ xor r9,r9
+
+ ;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3]
+ mov rax,r15 ; last time we need 2*a.n[1]
+ mul rsi
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,rbx
+ mul rcx
+ mov r15,rbp ; modulus
+ add r8,rax
+ adc r9,rdx
+ and r15,r8
+ shrd r8,r9,52
+ xor r9,r9
+
+ ;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3]
+ mov rax,rbx ; last time we need 2*a.n[2]
+ mul rsi
+ add r8,rax
+ adc r9,rdx
+
+ mov rax,rcx ; a.n[3]
+ mul rax
+ mov rbx,rbp ; modulus
+ add r8,rax
+ adc r9,rdx
+ and rbx,r8 ; only need lower dword
+ lea rax,[2*rcx]
+ shrd r8,r9,52
+ xor r9,r9
+
+ ;; c+=2*a.n[3]*a.n[4]
+ mul rsi
+ mov rcx,rbp ; modulus
+ add r8,rax
+ adc r9,rdx
+ and rcx,r8 ; only need lower dword
+ shrd r8,r9,52
+ xor r9,r9
+
+ ;; c+=a.n[4]*a.n[4]
+ mov rax,rsi
+ mul rax
+ ;; mov rbp,rbp ; modulus is already there!
+ add r8,rax
+ adc r9,rdx
+ and rbp,r8
+ shrd r8,r9,52
+ xor r9,r9
+
+ mov rsi,r8
+
+ ;; *******************************************************
+ jmp common_exit_norm
+ end
+
+