1 files changed, 469 insertions, 0 deletions
diff --git a/src/field_5x52_asm.asm b/src/field_5x52_asm.asm
new file mode 100644
index 0000000000..5e785f7630
--- /dev/null
+++ b/src/field_5x52_asm.asm
@@ -0,0 +1,469 @@
+	;; Added by Diederik Huys, March 2013
+	;;
+	;; Provided public procedures:
+	;; 	secp256k1_fe_mul_inner
+	;; 	secp256k1_fe_sqr_inner
+	;;
+	;; Needed tools: YASM (http://yasm.tortall.net)
+	;;
+	;; 
+
+	BITS 64
+
+%ifidn   __OUTPUT_FORMAT__,macho64
+%define SYM(x) _ %+ x
+%else
+%define SYM(x) x
+%endif
+
+	;;  Procedure ExSetMult
+	;;  Register Layout:
+	;;  INPUT: 	rdi	= a->n
+	;; 	   	rsi  	= b->n
+	;; 	   	rdx  	= r->a
+	;; 
+	;;  INTERNAL:	rdx:rax  = multiplication accumulator
+	;; 		r9:r8    = c
+	;; 		r10-r13  = t0-t3
+	;; 		r14	 = b.n[0] / t4
+	;; 		r15	 = b.n[1] / t5
+	;; 		rbx	 = b.n[2] / t6
+	;; 		rcx	 = b.n[3] / t7
+	;; 		rbp	 = Constant 0FFFFFFFFFFFFFh / t8
+	;; 		rsi	 = b.n / b.n[4] / t9
+
+	GLOBAL SYM(secp256k1_fe_mul_inner)
+	ALIGN 32
+SYM(secp256k1_fe_mul_inner):
+	push rbp
+	push rbx
+	push r12
+	push r13
+	push r14
+	push r15
+	push rdx
+	mov r14,[rsi+8*0]	; preload b.n[0]. This will be the case until
+				; b.n[0] is no longer needed, then we reassign
+				; r14 to t4
+	;; c=a.n[0] * b.n[0]
+   	mov rax,[rdi+0*8]	; load a.n[0]
+	mov rbp,0FFFFFFFFFFFFFh
+	mul r14			; rdx:rax=a.n[0]*b.n[0]
+	mov r15,[rsi+1*8]
+	mov r10,rbp		; load modulus into target register for t0
+	mov r8,rax
+	and r10,rax		; only need lower qword of c
+	shrd r8,rdx,52
+	xor r9,r9		; c < 2^64, so we ditch the HO part 
+
+	;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0]
+	mov rax,[rdi+0*8]
+	mul r15			
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+1*8]
+	mul r14			
+	mov r11,rbp
+	mov rbx,[rsi+2*8]
+	add r8,rax
+	adc r9,rdx
+	and r11,r8
+	shrd r8,r9,52
+	xor r9,r9
+	
+	;; c+=a.n[0 1 2] * b.n[2 1 0]
+	mov rax,[rdi+0*8]
+	mul rbx			
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+1*8]
+	mul r15			
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+2*8]
+	mul r14
+	mov r12,rbp		
+	mov rcx,[rsi+3*8]
+	add r8,rax
+	adc r9,rdx
+	and r12,r8		
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=a.n[0 1 2 3] * b.n[3 2 1 0]
+	mov rax,[rdi+0*8]
+	mul rcx			
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+1*8]
+	mul rbx			
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+2*8]
+	mul r15			
+	add r8,rax
+	adc r9,rdx
+	
+	mov rax,[rdi+3*8]
+	mul r14			
+	mov r13,rbp             
+	mov rsi,[rsi+4*8]	; load b.n[4] and destroy pointer
+	add r8,rax
+	adc r9,rdx
+	and r13,r8
+
+	shrd r8,r9,52
+	xor r9,r9		
+
+
+	;; c+=a.n[0 1 2 3 4] * b.n[4 3 2 1 0]
+	mov rax,[rdi+0*8]
+	mul rsi
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+1*8]
+	mul rcx
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+2*8]
+	mul rbx			
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+3*8]
+	mul r15			
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+4*8]
+	mul r14			
+	mov r14,rbp             ; load modulus into t4 and destroy a.n[0]
+	add r8,rax
+	adc r9,rdx
+	and r14,r8
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=a.n[1 2 3 4] * b.n[4 3 2 1]
+	mov rax,[rdi+1*8]
+	mul rsi
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+2*8]
+	mul rcx
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+3*8]
+	mul rbx
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+4*8]
+	mul r15
+	mov r15,rbp		
+	add r8,rax
+	adc r9,rdx
+
+	and r15,r8
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=a.n[2 3 4] * b.n[4 3 2]
+	mov rax,[rdi+2*8]
+	mul rsi
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+3*8]
+	mul rcx
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+4*8]
+	mul rbx
+	mov rbx,rbp		
+	add r8,rax
+	adc r9,rdx
+
+	and rbx,r8		
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=a.n[3 4] * b.n[4 3]
+	mov rax,[rdi+3*8]
+	mul rsi
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+4*8]
+	mul rcx
+	mov rcx,rbp		
+	add r8,rax
+	adc r9,rdx
+	and rcx,r8		
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=a.n[4] * b.n[4]
+	mov rax,[rdi+4*8]
+	mul rsi
+	;; mov rbp,rbp		; modulus already there!
+	add r8,rax
+	adc r9,rdx
+	and rbp,r8 
+	shrd r8,r9,52
+	xor r9,r9		
+
+	mov rsi,r8		; load c into t9 and destroy b.n[4]
+
+	;; *******************************************************
+common_exit_norm:
+	mov rdi,01000003D10h	; load constant
+
+	mov rax,r15		; get t5
+	mul rdi
+	add rax,r10    		; +t0
+	adc rdx,0
+	mov r10,0FFFFFFFFFFFFFh ; modulus. Sadly, we ran out of registers!
+	mov r8,rax		; +c
+	and r10,rax
+	shrd r8,rdx,52
+	xor r9,r9
+
+	mov rax,rbx		; get t6
+	mul rdi
+	add rax,r11		; +t1
+	adc rdx,0
+	mov r11,0FFFFFFFFFFFFFh ; modulus
+	add r8,rax		; +c
+	adc r9,rdx
+	and r11,r8
+	shrd r8,r9,52
+	xor r9,r9
+
+	mov rax,rcx    		; get t7
+	mul rdi
+	add rax,r12		; +t2
+	adc rdx,0
+	pop rbx			; retrieve pointer to this.n	
+	mov r12,0FFFFFFFFFFFFFh ; modulus
+	add r8,rax		; +c
+	adc r9,rdx
+	and r12,r8
+	mov [rbx+2*8],r12	; mov into this.n[2]
+	shrd r8,r9,52
+	xor r9,r9
+	
+	mov rax,rbp    		; get t8
+	mul rdi
+	add rax,r13    		; +t3
+	adc rdx,0
+	mov r13,0FFFFFFFFFFFFFh ; modulus
+	add r8,rax		; +c
+	adc r9,rdx
+	and r13,r8
+	mov [rbx+3*8],r13	; -> this.n[3]
+	shrd r8,r9,52
+	xor r9,r9
+	
+	mov rax,rsi    		; get t9
+	mul rdi
+	add rax,r14    		; +t4
+	adc rdx,0
+	mov r14,0FFFFFFFFFFFFh	; !!!
+	add r8,rax		; +c
+	adc r9,rdx
+	and r14,r8
+	mov [rbx+4*8],r14	; -> this.n[4]
+	shrd r8,r9,48		; !!!
+	xor r9,r9
+	
+	mov rax,01000003D1h
+	mul r8		
+	add rax,r10
+	adc rdx,0
+	mov r10,0FFFFFFFFFFFFFh ; modulus
+	mov r8,rax
+	and rax,r10
+	shrd r8,rdx,52
+	mov [rbx+0*8],rax	; -> this.n[0]
+	add r8,r11
+	mov [rbx+1*8],r8	; -> this.n[1]
+
+	pop r15
+	pop r14
+	pop r13
+	pop r12
+	pop rbx
+	pop rbp
+	ret
+
+	
+	;;  PROC ExSetSquare
+	;;  Register Layout:
+	;;  INPUT: 	rdi	 = a.n
+	;; 	   	rsi  	 = this.a
+	;;  INTERNAL:	rdx:rax  = multiplication accumulator
+	;; 		r9:r8    = c
+	;; 		r10-r13  = t0-t3
+	;; 		r14	 = a.n[0] / t4
+	;; 		r15	 = a.n[1] / t5
+	;; 		rbx	 = a.n[2] / t6
+	;; 		rcx	 = a.n[3] / t7
+	;; 		rbp	 = 0FFFFFFFFFFFFFh / t8
+	;; 		rsi	 = a.n[4] / t9
+	GLOBAL SYM(secp256k1_fe_sqr_inner)
+	ALIGN 32
+SYM(secp256k1_fe_sqr_inner):
+	push rbp
+	push rbx
+	push r12
+	push r13
+	push r14
+	push r15
+	push rsi
+	mov rbp,0FFFFFFFFFFFFFh
+	
+	;; c=a.n[0] * a.n[0]
+   	mov r14,[rdi+0*8]	; r14=a.n[0]
+	mov r10,rbp		; modulus 
+	mov rax,r14
+	mul rax
+	mov r15,[rdi+1*8]	; a.n[1]
+	add r14,r14		; r14=2*a.n[0]
+	mov r8,rax
+	and r10,rax		; only need lower qword
+	shrd r8,rdx,52
+	xor r9,r9
+
+	;; c+=2*a.n[0] * a.n[1]
+	mov rax,r14		; r14=2*a.n[0]
+	mul r15
+	mov rbx,[rdi+2*8]	; rbx=a.n[2]
+	mov r11,rbp 		; modulus
+	add r8,rax
+	adc r9,rdx
+	and r11,r8
+	shrd r8,r9,52
+	xor r9,r9
+	
+	;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1]
+	mov rax,r14
+	mul rbx
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,r15
+	mov r12,rbp		; modulus
+	mul rax
+	mov rcx,[rdi+3*8]	; rcx=a.n[3]
+	add r15,r15		; r15=a.n[1]*2
+	add r8,rax
+	adc r9,rdx
+	and r12,r8		; only need lower dword
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2]
+	mov rax,r14
+	mul rcx
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,r15		; rax=2*a.n[1]
+	mov r13,rbp		; modulus
+	mul rbx
+	mov rsi,[rdi+4*8]	; rsi=a.n[4]
+	add r8,rax
+	adc r9,rdx
+	and r13,r8
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2]
+	mov rax,r14		; last time we need 2*a.n[0]
+	mul rsi
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,r15
+	mul rcx
+	mov r14,rbp		; modulus
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,rbx
+	mul rax
+	add rbx,rbx		; rcx=2*a.n[2]
+	add r8,rax
+	adc r9,rdx
+	and r14,r8
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3]
+	mov rax,r15		; last time we need 2*a.n[1]
+	mul rsi
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,rbx
+	mul rcx
+	mov r15,rbp		; modulus
+	add r8,rax
+	adc r9,rdx
+	and r15,r8
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3]
+	mov rax,rbx		; last time we need 2*a.n[2]
+	mul rsi
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,rcx		; a.n[3]
+	mul rax
+	mov rbx,rbp		; modulus
+	add r8,rax
+	adc r9,rdx
+	and rbx,r8		; only need lower dword
+	lea rax,[2*rcx]
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=2*a.n[3]*a.n[4]
+	mul rsi
+	mov rcx,rbp		; modulus
+	add r8,rax
+	adc r9,rdx
+	and rcx,r8		; only need lower dword
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=a.n[4]*a.n[4]
+	mov rax,rsi
+	mul rax
+	;; mov rbp,rbp		; modulus is already there!
+	add r8,rax
+	adc r9,rdx
+	and rbp,r8 
+	shrd r8,r9,52
+	xor r9,r9		
+
+	mov rsi,r8
+
+	;; *******************************************************
+	jmp common_exit_norm
+	end
+
+