aboutsummaryrefslogtreecommitdiff
path: root/lib/liblame/libmp3lame/i386/fftfpu.nas
diff options
context:
space:
mode:
Diffstat (limited to 'lib/liblame/libmp3lame/i386/fftfpu.nas')
-rw-r--r--lib/liblame/libmp3lame/i386/fftfpu.nas619
1 files changed, 619 insertions, 0 deletions
diff --git a/lib/liblame/libmp3lame/i386/fftfpu.nas b/lib/liblame/libmp3lame/i386/fftfpu.nas
new file mode 100644
index 0000000000..2ae89bd799
--- /dev/null
+++ b/lib/liblame/libmp3lame/i386/fftfpu.nas
@@ -0,0 +1,619 @@
+; back port from GOGO-no coda 2.24b by Takehiro TOMINAGA
+
+; GOGO-no-coda
+; Copyright (C) 1999 shigeo
+; special thanks to URURI
+
+%include "nasm.h"
+
+ externdef costab_fft
+ externdef sintab_fft
+
+ segment_data
+ align 32
+D_1_41421 dd 1.41421356
+D_1_0 dd 1.0
+D_0_5 dd 0.5
+D_0_25 dd 0.25
+D_0_0005 dd 0.0005
+D_0_0 dd 0.0
+
+ segment_code
+
+;void fht(float *fz, int n);
+proc fht_FPU
+
+%$fz arg 4
+%$n arg 4
+
+%$k local 4
+
+%$f0 local 4
+%$f1 local 4
+%$f2 local 4
+%$f3 local 4
+
+%$g0 local 4
+%$g1 local 4
+%$g2 local 4
+%$g3 local 4
+
+%$s1 local 4
+%$c1 local 4
+%$s2 local 4
+%$c2 local 4
+
+%$t_s local 4
+%$t_c local 4
+ alloc
+
+ pushd ebp, ebx, esi, edi
+
+fht_FPU_1st_part:
+
+fht_FPU_2nd_part:
+
+fht_FPU_3rd_part:
+
+.do_init:
+ mov r3, 16 ;k1*fsize = 4*fsize = k4
+ mov r4, 8 ;kx = k1/2
+ mov r2, 48 ;k3*fsize
+ mov dword [sp(%$k)], 2 ;k = 2
+ mov r0, [sp(%$fz)] ;fi
+ lea r1, [r0+8] ;gi = fi + kx
+
+.do:
+.do2:
+ ;f
+ fld dword [r0]
+ fsub dword [r0+r3]
+
+ fld dword [r0]
+ fadd dword [r0+r3]
+
+ fld dword [r0+r3*2]
+ fsub dword [r0+r2]
+
+ fld dword [r0+r3*2]
+ fadd dword [r0+r2] ;f2 f3 f0 f1
+
+ fld st2 ;f0 f2 f3 f0 f1
+ fadd st0, st1
+ fstp dword [r0] ;fi[0]
+
+ fld st3 ;f1 f2 f3 f0 f1
+ fadd st0, st2
+ fstp dword [r0+r3] ;fi[k1]
+
+ fsubr st0, st2 ;f0-f2 f3 f0 f1
+ fstp dword [r0+r3*2] ;fi[k2]
+
+ fsubr st0, st2 ;f1-f3 f0 f1
+ fstp dword [r0+r2] ;fi[k3]
+ fcompp
+
+ ;g
+ fld dword [r1]
+ fsub dword [r1+r3]
+
+ fld dword [r1]
+ fadd dword [r1+r3]
+
+ fld dword [D_1_41421]
+ fmul dword [r1+r2]
+
+ fld dword [D_1_41421]
+ fmul dword [r1+r3*2] ;g2 g3 g0 g1
+
+ fld st2 ;g0 g2 g3 g0 g1
+ fadd st0, st1
+ fstp dword [r1] ;gi[0]
+
+ fld st3 ;g1 g2 g3 g0 g1
+ fadd st0, st2
+ fstp dword [r1+r3] ;gi[k1]
+
+ fsubr st0, st2 ;g0-g2 g3 g0 g1
+ fstp dword [r1+r3*2] ;gi[k2]
+
+ fsubr st0, st2 ;g1-g3 g0 g1
+ fstp dword [r1+r2] ;gi[k3]
+ fcompp
+
+ lea r0, [r0+r3*4]
+ lea r1, [r1+r3*4]
+ cmp r0, r6
+ jb .do2
+
+
+ mov r0, [sp(%$k)]
+ fld dword [costab_fft +r0*4]
+ fstp dword [sp(%$t_c)]
+ fld dword [sintab_fft +r0*4]
+ fstp dword [sp(%$t_s)]
+ fld dword [D_1_0]
+ fstp dword [sp(%$c1)]
+ fld dword [D_0_0]
+ fstp dword [sp(%$s1)]
+
+.for_init:
+ mov r5, 4 ;i = 1*fsize
+
+.for:
+ fld dword [sp(%$c1)]
+ fmul dword [sp(%$t_c)]
+ fld dword [sp(%$s1)]
+ fmul dword [sp(%$t_s)]
+ fsubp st1, st0 ;c1
+
+ fld dword [sp(%$c1)]
+ fmul dword [sp(%$t_s)]
+ fld dword [sp(%$s1)]
+ fmul dword [sp(%$t_c)]
+ faddp st1, st0 ;s1 c1
+
+ fld st1
+ fmul st0, st0 ;c1c1 s1 c1
+ fld st1
+ fmul st0, st0 ;s1s1 c1c1 s1 c1
+ fsubp st1, st0 ;c2 s1 c1
+ fstp dword [sp(%$c2)] ;s1 c1
+
+ fld st1 ;c1 s1 c1
+ fmul st0, st1 ;c1s1 s1 c1
+ fadd st0, st0 ;s2 s1 c1
+ fstp dword [sp(%$s2)] ;s1 c1
+
+ fstp dword [sp(%$s1)] ;c1
+ fstp dword [sp(%$c1)] ;
+
+ mov r0, [sp(%$fz)]
+ add r0, r5 ;r0 = fi
+ mov r1, [sp(%$fz)]
+ add r1, r3
+ sub r1, r5 ;r1 = gi
+
+.do3:
+ fld dword [sp(%$s2)]
+ fmul dword [r0+r3]
+ fld dword [sp(%$c2)]
+ fmul dword [r1+r3]
+ fsubp st1, st0 ;b = s2*fi[k1] - c2*gi[k1]
+
+ fld dword [sp(%$c2)]
+ fmul dword [r0+r3]
+ fld dword [sp(%$s2)]
+ fmul dword [r1+r3]
+ faddp st1, st0 ;a = c2*fi[k1] + s2*gi[k1] b
+
+ fld dword [r0]
+ fsub st0, st1 ;f1 a b
+ fstp dword [sp(%$f1)] ;a b
+
+ fadd dword [r0] ;f0 b
+ fstp dword [sp(%$f0)] ;b
+
+ fld dword [r1]
+ fsub st0, st1 ;g1 b
+ fstp dword [sp(%$g1)] ;b
+
+ fadd dword [r1] ;g0
+ fstp dword [sp(%$g0)] ;
+
+
+ fld dword [sp(%$s2)]
+ fmul dword [r0+r2]
+ fld dword [sp(%$c2)]
+ fmul dword [r1+r2]
+ fsubp st1, st0 ;b = s2*fi[k3] - c2*gi[k3]
+
+ fld dword [sp(%$c2)]
+ fmul dword [r0+r2]
+ fld dword [sp(%$s2)]
+ fmul dword [r1+r2]
+ faddp st1, st0 ;a = c2*fi[k3] + s2*gi[k3] b
+
+ fld dword [r0+r3*2]
+ fsub st0, st1 ;f3 a b
+ fstp dword [sp(%$f3)] ;a b
+
+ fadd dword [r0+r3*2] ;f2 b
+ fstp dword [sp(%$f2)] ;b
+
+ fld dword [r1+r3*2]
+ fsub st0, st1 ;g3 b
+ fstp dword [sp(%$g3)] ;b
+
+ fadd dword [r1+r3*2] ;g2
+ fstp dword [sp(%$g2)] ;
+
+
+ fld dword [sp(%$s1)]
+ fmul dword [sp(%$f2)]
+ fld dword [sp(%$c1)]
+ fmul dword [sp(%$g3)]
+ fsubp st1, st0 ;b = s1*f2 - c1*g3
+
+ fld dword [sp(%$c1)]
+ fmul dword [sp(%$f2)]
+ fld dword [sp(%$s1)]
+ fmul dword [sp(%$g3)]
+ faddp st1, st0 ;a = c1*f2 + s1*g3 b
+
+ fld dword [sp(%$f0)]
+ fsub st0, st1 ;fi[k2] a b
+ fstp dword [r0+r3*2]
+
+ fadd dword [sp(%$f0)] ;fi[0] b
+ fstp dword [r0]
+
+ fld dword [sp(%$g1)]
+ fsub st0, st1 ;gi[k3] b
+ fstp dword [r1+r2]
+
+ fadd dword [sp(%$g1)] ;gi[k1]
+ fstp dword [r1+r3]
+
+
+ fld dword [sp(%$c1)]
+ fmul dword [sp(%$g2)]
+ fld dword [sp(%$s1)]
+ fmul dword [sp(%$f3)]
+ fsubp st1, st0 ;b = c1*g2 - s1*f3
+
+ fld dword [sp(%$s1)]
+ fmul dword [sp(%$g2)]
+ fld dword [sp(%$c1)]
+ fmul dword [sp(%$f3)]
+ faddp st1, st0 ;a = s1*g2 + c1*f3 b
+
+ fld dword [sp(%$g0)]
+ fsub st0, st1 ;gi[k2] a b
+ fstp dword [r1+r3*2]
+
+ fadd dword [sp(%$g0)] ;gi[0] b
+ fstp dword [r1]
+
+ fld dword [sp(%$f1)]
+ fsub st0, st1 ;fi[k3] b
+ fstp dword [r0+r2]
+
+ fadd dword [sp(%$f1)] ;fi[k1]
+ fstp dword [r0+r3]
+
+
+ lea r0, [r0+r3*4]
+ lea r1, [r1+r3*4]
+ cmp r0, r6
+ jb near .do3
+
+ add r5, 4
+ cmp r5, r4
+ jb near .for
+
+ cmp r3, [sp(%$n)]
+ jae .exit
+
+ add dword [sp(%$k)], 2 ;k += 2;
+ lea r3, [r3*4] ;k1 *= 4
+ lea r2, [r2*4] ;k3 *= 4
+ lea r4, [r4*4] ;kx *= 4
+ mov r0, [sp(%$fz)] ;fi
+ lea r1, [r0+r4] ;gi = fi + kx
+ jmp .do
+
+.exit:
+ popd ebp, ebx, esi, edi
+endproc
+
+;*************************************************************
+
+;void fht_FPU_FXCH(float *fz, int n);
+proc fht_FPU_FXCH
+
+%$fz arg 4
+%$n arg 4
+
+%$k local 4
+
+%$f0 local 4
+%$f1 local 4
+%$f2 local 4
+%$f3 local 4
+
+%$g0 local 4
+%$g1 local 4
+%$g2 local 4
+%$g3 local 4
+
+%$s1 local 4
+%$c1 local 4
+%$s2 local 4
+%$c2 local 4
+
+%$t_s local 4
+%$t_c local 4
+ alloc
+
+ pushd ebp, ebx, esi, edi
+
+fht_FPU_FXCH_1st_part:
+
+fht_FPU_FXCH_2nd_part:
+
+fht_FPU_FXCH_3rd_part:
+
+.do_init:
+ mov r3, 16 ;k1*fsize = 4*fsize = k4
+ mov r4, 8 ;kx = k1/2
+ mov r2, 48 ;k3*fsize
+ mov dword [sp(%$k)], 2 ;k = 2
+ mov r0, [sp(%$fz)] ;fi
+ lea r1, [r0+8] ;gi = fi + kx
+
+.do:
+.do2:
+ ;f
+ fld dword [r0]
+ fsub dword [r0+r3]
+ fld dword [r0]
+ fadd dword [r0+r3]
+
+ fld dword [r0+r3*2]
+ fsub dword [r0+r2]
+ fld dword [r0+r3*2]
+ fadd dword [r0+r2] ;f2 f3 f0 f1
+
+ fld st3
+ fld st3
+ fxch st5
+ fadd st0, st3
+ fxch st4
+ fadd st0, st2
+ fxch st3
+ fsubp st1, st0
+ fxch st1
+ fsubp st4, st0
+ fxch st2
+
+ fstp dword [r0+r3] ;fi[k1]
+ fstp dword [r0] ;fi[0]
+ fstp dword [r0+r2] ;fi[k3]
+ fstp dword [r0+r3*2] ;fi[k2]
+
+ ;g
+ fld dword [r1]
+ fsub dword [r1+r3]
+ fld dword [r1]
+ fadd dword [r1+r3]
+
+ fld dword [D_1_41421]
+ fmul dword [r1+r2]
+ fld dword [D_1_41421]
+ fmul dword [r1+r3*2] ;g2 g3 g0 g1
+
+ fld st3
+ fld st3
+ fxch st5
+ fadd st0, st3
+ fxch st4
+ fadd st0, st2
+ fxch st3
+ fsubp st1, st0
+ fxch st1
+ fsubp st4, st0
+ fxch st2
+
+ fstp dword [r1+r3] ;gi[k1]
+ fstp dword [r1] ;gi[0]
+ fstp dword [r1+r2] ;gi[k3]
+ fstp dword [r1+r3*2] ;gi[k2]
+
+ lea r0, [r0+r3*4]
+ lea r1, [r1+r3*4]
+ cmp r0, r6
+ jb .do2
+
+
+ mov r0, [sp(%$k)]
+ fld dword [costab_fft +r0*4]
+ fld dword [sintab_fft +r0*4]
+ fld dword [D_1_0]
+ fld dword [D_0_0]
+ fxch st3
+ fstp dword [sp(%$t_c)]
+ fxch st1
+ fstp dword [sp(%$t_s)]
+ fstp dword [sp(%$c1)]
+ fstp dword [sp(%$s1)]
+
+.for_init:
+ mov r5, 4 ;i = 1*fsize
+
+.for:
+ fld dword [sp(%$c1)]
+ fmul dword [sp(%$t_c)]
+ fld dword [sp(%$s1)]
+ fmul dword [sp(%$t_s)]
+
+ fld dword [sp(%$c1)]
+ fmul dword [sp(%$t_s)]
+ fld dword [sp(%$s1)]
+ fmul dword [sp(%$t_c)]
+ fxch st2
+ fsubp st3, st0 ;c1
+ faddp st1, st0 ;s1 c1
+
+ fld st1
+ fxch st2
+ fmul st0, st0 ;c1c1 s1 c1
+ fld st1
+ fxch st2
+ fmul st0, st0 ;s1s1 c1c1 s1 c1
+
+ fxch st3
+ fst dword [sp(%$c1)] ;c1
+ fxch st2
+ fst dword [sp(%$s1)] ;s1 c1c1 c1 s1s1
+
+ fmulp st2, st0
+ fsubrp st2, st0
+ fadd st0, st0 ;s2 c2
+ fxch st1
+ fstp dword [sp(%$c2)]
+ fstp dword [sp(%$s2)]
+
+ mov r0, [sp(%$fz)]
+ mov r1, [sp(%$fz)]
+ add r0, r5 ;r0 = fi
+ add r1, r3
+ sub r1, r5 ;r1 = gi
+
+.do3:
+ fld dword [sp(%$s2)]
+ fmul dword [r0+r3]
+ fld dword [sp(%$c2)]
+ fmul dword [r1+r3]
+
+ fld dword [sp(%$c2)]
+ fmul dword [r0+r3]
+ fld dword [sp(%$s2)]
+ fmul dword [r1+r3]
+ fxch st2
+ fsubp st3, st0 ;b = s2*fi[k1] - c2*gi[k1]
+ faddp st1, st0 ;a = c2*fi[k1] + s2*gi[k1] b
+
+ fld dword [r1]
+ fsub st0, st2 ;g1 a b
+ fxch st2
+ fadd dword [r1] ;g0 a g1
+
+ fld dword [r0]
+ fsub st0, st2 ;f1 g0 a g1
+ fxch st2
+ fadd dword [r0] ;f0 g0 f1 g1
+
+ fxch st3
+ fstp dword [sp(%$g1)]
+ fstp dword [sp(%$g0)]
+ fstp dword [sp(%$f1)]
+ fstp dword [sp(%$f0)]
+
+
+ fld dword [sp(%$s2)]
+ fmul dword [r0+r2]
+ fld dword [sp(%$c2)]
+ fmul dword [r1+r2]
+
+ fld dword [sp(%$c2)]
+ fmul dword [r0+r2]
+ fld dword [sp(%$s2)]
+ fmul dword [r1+r2]
+ fxch st2
+ fsubp st3, st0 ;b = s2*fi[k3] - c2*gi[k3]
+ faddp st1, st0 ;a = c2*fi[k3] + s2*gi[k3] b
+
+
+ fld dword [r1+r3*2]
+ fsub st0, st2 ;g3 a b
+ fxch st2
+ fadd dword [r1+r3*2] ;g2 a g3
+
+ fld dword [r0+r3*2]
+ fsub st0, st2 ;f3 g2 a g3
+ fxch st2
+ fadd dword [r0+r3*2] ;f2 g2 f3 g3
+
+ fxch st3
+ fstp dword [sp(%$g3)]
+ fstp dword [sp(%$g2)]
+ fstp dword [sp(%$f3)]
+ fstp dword [sp(%$f2)]
+
+
+ fld dword [sp(%$s1)]
+ fmul dword [sp(%$f2)]
+ fld dword [sp(%$c1)]
+ fmul dword [sp(%$g3)]
+
+ fld dword [sp(%$c1)]
+ fmul dword [sp(%$f2)]
+ fld dword [sp(%$s1)]
+ fmul dword [sp(%$g3)]
+ fxch st2
+ fsubp st3, st0 ;b = s1*f2 - c1*g3
+ faddp st1, st0 ;a = c1*f2 + s1*g3 b
+
+ fld dword [sp(%$g1)]
+ fsub st0, st2 ;gi[k3] a b
+ fxch st2
+ fadd dword [sp(%$g1)] ;gi[k1] a gi[k3]
+
+ fld dword [sp(%$f0)]
+ fsub st0, st2 ;fi[k2] gi[k1] a gi[k3]
+ fxch st2
+ fadd dword [sp(%$f0)] ;fi[0] gi[k1] fi[k2] gi[k3]
+
+ fxch st3
+ fstp dword [r1+r2]
+ fstp dword [r1+r3]
+ fstp dword [r0+r3*2]
+ fstp dword [r0]
+
+
+ fld dword [sp(%$c1)]
+ fmul dword [sp(%$g2)]
+ fld dword [sp(%$s1)]
+ fmul dword [sp(%$f3)]
+
+ fld dword [sp(%$s1)]
+ fmul dword [sp(%$g2)]
+ fld dword [sp(%$c1)]
+ fmul dword [sp(%$f3)]
+ fxch st2
+ fsubp st3, st0 ;b = c1*g2 - s1*f3
+ faddp st1, st0 ;a = s1*g2 + c1*f3 b
+
+ fld dword [sp(%$f1)]
+ fsub st0, st2 ;fi[k3] a b
+ fxch st2
+ fadd dword [sp(%$f1)] ;fi[k1] a fi[k3]
+
+ fld dword [sp(%$g0)]
+ fsub st0, st2 ;gi[k2] fi[k1] a fi[k3]
+ fxch st2
+ fadd dword [sp(%$g0)] ;gi[0] fi[k1] gi[k2] fi[k3]
+
+ fxch st3
+ fstp dword [r0+r2]
+ fstp dword [r0+r3]
+ fstp dword [r1+r3*2]
+ fstp dword [r1]
+
+
+ lea r0, [r0+r3*4]
+ lea r1, [r1+r3*4]
+ cmp r0, r6
+ jb near .do3
+
+ add r5, 4
+ cmp r5, r4
+ jb near .for
+
+ cmp r3, [sp(%$n)]
+ jae .exit
+
+ add dword [sp(%$k)], 2 ;k += 2;
+ lea r3, [r3*4] ;k1 *= 4
+ lea r2, [r2*4] ;k3 *= 4
+ lea r4, [r4*4] ;kx *= 4
+ mov r0, [sp(%$fz)] ;fi
+ lea r1, [r0+r4] ;gi = fi + kx
+ jmp .do
+
+.exit:
+ popd ebp, ebx, esi, edi
+endproc
+
+ end