diff options
Diffstat (limited to 'lib/liblame/libmp3lame/i386/fftfpu.nas')
-rw-r--r-- | lib/liblame/libmp3lame/i386/fftfpu.nas | 619 |
1 files changed, 619 insertions, 0 deletions
; File: lib/liblame/libmp3lame/i386/fftfpu.nas
;
; back port from GOGO-no coda 2.24b by Takehiro TOMINAGA

; GOGO-no-coda
; Copyright (C) 1999 shigeo
; special thanks to URURI

; Syntax: NASM (Intel operand order).  Target: 32-bit x86, x87 FPU only.
; Register aliases r0..r6, sp(), proc/endproc, arg, local, alloc,
; pushd/popd, externdef, segment_data and segment_code all come from
; nasm.h; their exact expansion is not visible in this file.

%include "nasm.h"

        externdef costab_fft
        externdef sintab_fft

        segment_data
        align   32
D_1_41421       dd      1.41421356      ; scale applied to gi[k2] / gi[k3]
D_1_0           dd      1.0             ; initial c1
D_0_5           dd      0.5             ; not referenced in this file
D_0_25          dd      0.25            ; not referenced in this file
D_0_0005        dd      0.0005          ; not referenced in this file
D_0_0           dd      0.0             ; initial s1

        segment_code

;-----------------------------------------------------------------------
; void fht(float *fz, int n);
;
; Exported as fht_FPU.  Runs the radix-4 butterfly passes ("3rd part")
; in place over the float array fz.  The 1st/2nd part labels below are
; empty in this back port.
;
; In:    fz  pointer to the float data (stack argument)
;        n   element count; the outer loop stops once k1*fsize >= n,
;            and k1*fsize is numerically k4 in elements, so n is
;            treated as a number of floats
; Out:   fz[] updated in place; no return value is set here
; Regs:  r0 = fi, r1 = gi, r2 = k3*fsize, r3 = k1*fsize,
;        r4 = kx*fsize, r5 = i*fsize, r6 = fn (end of data)
; Saves: ebp, ebx, esi, edi via pushd/popd
; FPU:   every path leaves the x87 stack balanced
;
; Fix:   r6 (fn) is now loaded in .do_init.  Previously the inner loops
;        compared r0 against r6 without it ever being written in this
;        routine, so the loop bound was whatever the caller left there.
;-----------------------------------------------------------------------
proc    fht_FPU

%$fz    arg     4
%$n     arg     4

%$k     local   4

%$f0    local   4
%$f1    local   4
%$f2    local   4
%$f3    local   4

%$g0    local   4
%$g1    local   4
%$g2    local   4
%$g3    local   4

%$s1    local   4
%$c1    local   4
%$s2    local   4
%$c2    local   4

%$t_s   local   4
%$t_c   local   4
        alloc

        pushd   ebp, ebx, esi, edi

fht_FPU_1st_part:

fht_FPU_2nd_part:

fht_FPU_3rd_part:

.do_init:
        mov     r3, 16                  ; k1*fsize = 4*fsize = k4
        mov     r4, 8                   ; kx = k1/2
        mov     r2, 48                  ; k3*fsize
        mov     dword [sp(%$k)], 2      ; k = 2
        mov     r0, [sp(%$fz)]          ; fi
        mov     r6, [sp(%$n)]
        lea     r6, [r0+r6*4]           ; fn = fz + n (loop bound for .do2/.do3)
        lea     r1, [r0+8]              ; gi = fi + kx

.do:
.do2:
        ; ---- f butterfly: fi[0], fi[k1], fi[k2], fi[k3] ----
        fld     dword [r0]
        fsub    dword [r0+r3]           ; f1 = fi[0] - fi[k1]

        fld     dword [r0]
        fadd    dword [r0+r3]           ; f0 = fi[0] + fi[k1]

        fld     dword [r0+r3*2]
        fsub    dword [r0+r2]           ; f3 = fi[k2] - fi[k3]

        fld     dword [r0+r3*2]
        fadd    dword [r0+r2]           ; f2 f3 f0 f1

        fld     st2                     ; f0 f2 f3 f0 f1
        fadd    st0, st1
        fstp    dword [r0]              ; fi[0] = f0 + f2

        fld     st3                     ; f1 f2 f3 f0 f1
        fadd    st0, st2
        fstp    dword [r0+r3]           ; fi[k1] = f1 + f3

        fsubr   st0, st2                ; f0-f2 f3 f0 f1
        fstp    dword [r0+r3*2]         ; fi[k2]

        fsubr   st0, st2                ; f1-f3 f0 f1
        fstp    dword [r0+r2]           ; fi[k3]
        fcompp                          ; discard the leftover f0 f1

        ; ---- g butterfly: gi[0], gi[k1], gi[k2], gi[k3] ----
        fld     dword [r1]
        fsub    dword [r1+r3]           ; g1 = gi[0] - gi[k1]

        fld     dword [r1]
        fadd    dword [r1+r3]           ; g0 = gi[0] + gi[k1]

        fld     dword [D_1_41421]
        fmul    dword [r1+r2]           ; g3 = 1.41421356 * gi[k3]

        fld     dword [D_1_41421]
        fmul    dword [r1+r3*2]         ; g2 g3 g0 g1

        fld     st2                     ; g0 g2 g3 g0 g1
        fadd    st0, st1
        fstp    dword [r1]              ; gi[0] = g0 + g2

        fld     st3                     ; g1 g2 g3 g0 g1
        fadd    st0, st2
        fstp    dword [r1+r3]           ; gi[k1] = g1 + g3

        fsubr   st0, st2                ; g0-g2 g3 g0 g1
        fstp    dword [r1+r3*2]         ; gi[k2]

        fsubr   st0, st2                ; g1-g3 g0 g1
        fstp    dword [r1+r2]           ; gi[k3]
        fcompp                          ; discard the leftover g0 g1

        lea     r0, [r0+r3*4]           ; fi += k4
        lea     r1, [r1+r3*4]           ; gi += k4
        cmp     r0, r6
        jb      .do2                    ; while (fi < fn)

        ; ---- load the rotation step for this pass, reset (c1, s1) = (1, 0) ----
        mov     r0, [sp(%$k)]
        fld     dword [costab_fft +r0*4]
        fstp    dword [sp(%$t_c)]
        fld     dword [sintab_fft +r0*4]
        fstp    dword [sp(%$t_s)]
        fld     dword [D_1_0]
        fstp    dword [sp(%$c1)]
        fld     dword [D_0_0]
        fstp    dword [sp(%$s1)]

.for_init:
        mov     r5, 4                   ; i = 1*fsize

.for:
        ; rotate (c1, s1) by (t_c, t_s)
        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$t_c)]
        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$t_s)]
        fsubp   st1, st0                ; c1

        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$t_s)]
        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$t_c)]
        faddp   st1, st0                ; s1 c1

        ; c2 = c1*c1 - s1*s1, s2 = 2*c1*s1
        fld     st1
        fmul    st0, st0                ; c1c1 s1 c1
        fld     st1
        fmul    st0, st0                ; s1s1 c1c1 s1 c1
        fsubp   st1, st0                ; c2 s1 c1
        fstp    dword [sp(%$c2)]        ; s1 c1

        fld     st1                     ; c1 s1 c1
        fmul    st0, st1                ; c1s1 s1 c1
        fadd    st0, st0                ; s2 s1 c1
        fstp    dword [sp(%$s2)]        ; s1 c1

        fstp    dword [sp(%$s1)]        ; c1
        fstp    dword [sp(%$c1)]        ;

        mov     r0, [sp(%$fz)]
        add     r0, r5                  ; r0 = fi = fz + i
        mov     r1, [sp(%$fz)]
        add     r1, r3
        sub     r1, r5                  ; r1 = gi = fz + k1 - i

.do3:
        ; ---- stage 1a: fi[0]/gi[0] with fi[k1]/gi[k1] ----
        fld     dword [sp(%$s2)]
        fmul    dword [r0+r3]
        fld     dword [sp(%$c2)]
        fmul    dword [r1+r3]
        fsubp   st1, st0                ; b = s2*fi[k1] - c2*gi[k1]

        fld     dword [sp(%$c2)]
        fmul    dword [r0+r3]
        fld     dword [sp(%$s2)]
        fmul    dword [r1+r3]
        faddp   st1, st0                ; a = c2*fi[k1] + s2*gi[k1]  b

        fld     dword [r0]
        fsub    st0, st1                ; f1 a b
        fstp    dword [sp(%$f1)]        ; a b

        fadd    dword [r0]              ; f0 b
        fstp    dword [sp(%$f0)]        ; b

        fld     dword [r1]
        fsub    st0, st1                ; g1 b
        fstp    dword [sp(%$g1)]        ; b

        fadd    dword [r1]              ; g0
        fstp    dword [sp(%$g0)]        ;

        ; ---- stage 1b: fi[k2]/gi[k2] with fi[k3]/gi[k3] ----
        fld     dword [sp(%$s2)]
        fmul    dword [r0+r2]
        fld     dword [sp(%$c2)]
        fmul    dword [r1+r2]
        fsubp   st1, st0                ; b = s2*fi[k3] - c2*gi[k3]

        fld     dword [sp(%$c2)]
        fmul    dword [r0+r2]
        fld     dword [sp(%$s2)]
        fmul    dword [r1+r2]
        faddp   st1, st0                ; a = c2*fi[k3] + s2*gi[k3]  b

        fld     dword [r0+r3*2]
        fsub    st0, st1                ; f3 a b
        fstp    dword [sp(%$f3)]        ; a b

        fadd    dword [r0+r3*2]         ; f2 b
        fstp    dword [sp(%$f2)]        ; b

        fld     dword [r1+r3*2]
        fsub    st0, st1                ; g3 b
        fstp    dword [sp(%$g3)]        ; b

        fadd    dword [r1+r3*2]         ; g2
        fstp    dword [sp(%$g2)]        ;

        ; ---- stage 2a: writes fi[k2], fi[0], gi[k3], gi[k1] ----
        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$f2)]
        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$g3)]
        fsubp   st1, st0                ; b = s1*f2 - c1*g3

        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$f2)]
        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$g3)]
        faddp   st1, st0                ; a = c1*f2 + s1*g3  b

        fld     dword [sp(%$f0)]
        fsub    st0, st1                ; fi[k2] a b
        fstp    dword [r0+r3*2]

        fadd    dword [sp(%$f0)]        ; fi[0] b
        fstp    dword [r0]

        fld     dword [sp(%$g1)]
        fsub    st0, st1                ; gi[k3] b
        fstp    dword [r1+r2]

        fadd    dword [sp(%$g1)]        ; gi[k1]
        fstp    dword [r1+r3]

        ; ---- stage 2b: writes gi[k2], gi[0], fi[k3], fi[k1] ----
        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$g2)]
        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$f3)]
        fsubp   st1, st0                ; b = c1*g2 - s1*f3

        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$g2)]
        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$f3)]
        faddp   st1, st0                ; a = s1*g2 + c1*f3  b

        fld     dword [sp(%$g0)]
        fsub    st0, st1                ; gi[k2] a b
        fstp    dword [r1+r3*2]

        fadd    dword [sp(%$g0)]        ; gi[0] b
        fstp    dword [r1]

        fld     dword [sp(%$f1)]
        fsub    st0, st1                ; fi[k3] b
        fstp    dword [r0+r2]

        fadd    dword [sp(%$f1)]        ; fi[k1]
        fstp    dword [r0+r3]

        lea     r0, [r0+r3*4]           ; fi += k4
        lea     r1, [r1+r3*4]           ; gi += k4
        cmp     r0, r6
        jb near .do3                    ; while (fi < fn)

        add     r5, 4                   ; i++
        cmp     r5, r4
        jb near .for                    ; while (i < kx)

        cmp     r3, [sp(%$n)]           ; k1*fsize (== k4 in elements) vs n
        jae     .exit

        add     dword [sp(%$k)], 2      ; k += 2;
        lea     r3, [r3*4]              ; k1 *= 4
        lea     r2, [r2*4]              ; k3 *= 4
        lea     r4, [r4*4]              ; kx *= 4
        mov     r0, [sp(%$fz)]          ; fi
        lea     r1, [r0+r4]             ; gi = fi + kx
        jmp     .do

.exit:
        popd    ebp, ebx, esi, edi
endproc

;*************************************************************

;-----------------------------------------------------------------------
; void fht_FPU_FXCH(float *fz, int n);
;
; Same passes, arguments and register roles as fht_FPU.  The arithmetic
; is interleaved with FXCH so that independent results stay on the x87
; stack and are stored in groups of four.
;
; In:    fz  pointer to the float data (stack argument)
;        n   element count (see the exit test at the end of the pass)
; Out:   fz[] updated in place; no return value is set here
; Regs:  r0 = fi, r1 = gi, r2 = k3*fsize, r3 = k1*fsize,
;        r4 = kx*fsize, r5 = i*fsize, r6 = fn (end of data)
; Saves: ebp, ebx, esi, edi via pushd/popd
;
; Fix:   r6 (fn) is now loaded in .do_init; it was previously compared
;        against without ever being written in this routine.
;-----------------------------------------------------------------------
proc    fht_FPU_FXCH

%$fz    arg     4
%$n     arg     4

%$k     local   4

%$f0    local   4
%$f1    local   4
%$f2    local   4
%$f3    local   4

%$g0    local   4
%$g1    local   4
%$g2    local   4
%$g3    local   4

%$s1    local   4
%$c1    local   4
%$s2    local   4
%$c2    local   4

%$t_s   local   4
%$t_c   local   4
        alloc

        pushd   ebp, ebx, esi, edi

fht_FPU_FXCH_1st_part:

fht_FPU_FXCH_2nd_part:

fht_FPU_FXCH_3rd_part:

.do_init:
        mov     r3, 16                  ; k1*fsize = 4*fsize = k4
        mov     r4, 8                   ; kx = k1/2
        mov     r2, 48                  ; k3*fsize
        mov     dword [sp(%$k)], 2      ; k = 2
        mov     r0, [sp(%$fz)]          ; fi
        mov     r6, [sp(%$n)]
        lea     r6, [r0+r6*4]           ; fn = fz + n (loop bound for .do2/.do3)
        lea     r1, [r0+8]              ; gi = fi + kx

.do:
.do2:
        ; ---- f butterfly ----
        fld     dword [r0]
        fsub    dword [r0+r3]           ; f1
        fld     dword [r0]
        fadd    dword [r0+r3]           ; f0 f1

        fld     dword [r0+r3*2]
        fsub    dword [r0+r2]           ; f3 f0 f1
        fld     dword [r0+r3*2]
        fadd    dword [r0+r2]           ; f2 f3 f0 f1

        fld     st3                     ; f1 f2 f3 f0 f1
        fld     st3                     ; f0 f1 f2 f3 f0 f1
        fxch    st5
        fadd    st0, st3                ; f1+f3 ...
        fxch    st4
        fadd    st0, st2                ; f0+f2 ...
        fxch    st3
        fsubp   st1, st0                ; f1-f3 ...
        fxch    st1
        fsubp   st4, st0                ; ... f0-f2
        fxch    st2                     ; f1+f3 f0+f2 f1-f3 f0-f2

        fstp    dword [r0+r3]           ; fi[k1]
        fstp    dword [r0]              ; fi[0]
        fstp    dword [r0+r2]           ; fi[k3]
        fstp    dword [r0+r3*2]         ; fi[k2]

        ; ---- g butterfly ----
        fld     dword [r1]
        fsub    dword [r1+r3]           ; g1
        fld     dword [r1]
        fadd    dword [r1+r3]           ; g0 g1

        fld     dword [D_1_41421]
        fmul    dword [r1+r2]           ; g3 g0 g1
        fld     dword [D_1_41421]
        fmul    dword [r1+r3*2]         ; g2 g3 g0 g1

        fld     st3
        fld     st3
        fxch    st5
        fadd    st0, st3
        fxch    st4
        fadd    st0, st2
        fxch    st3
        fsubp   st1, st0
        fxch    st1
        fsubp   st4, st0
        fxch    st2                     ; g1+g3 g0+g2 g1-g3 g0-g2

        fstp    dword [r1+r3]           ; gi[k1]
        fstp    dword [r1]              ; gi[0]
        fstp    dword [r1+r2]           ; gi[k3]
        fstp    dword [r1+r3*2]         ; gi[k2]

        lea     r0, [r0+r3*4]           ; fi += k4
        lea     r1, [r1+r3*4]           ; gi += k4
        cmp     r0, r6
        jb      .do2                    ; while (fi < fn)

        ; ---- load the rotation step for this pass, reset (c1, s1) = (1, 0) ----
        mov     r0, [sp(%$k)]
        fld     dword [costab_fft +r0*4]
        fld     dword [sintab_fft +r0*4]
        fld     dword [D_1_0]
        fld     dword [D_0_0]           ; 0.0 1.0 sin cos
        fxch    st3
        fstp    dword [sp(%$t_c)]
        fxch    st1
        fstp    dword [sp(%$t_s)]
        fstp    dword [sp(%$c1)]        ; c1 = 1.0
        fstp    dword [sp(%$s1)]        ; s1 = 0.0

.for_init:
        mov     r5, 4                   ; i = 1*fsize

.for:
        ; rotate (c1, s1) by (t_c, t_s)
        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$t_c)]
        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$t_s)]

        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$t_s)]
        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$t_c)]
        fxch    st2
        fsubp   st3, st0                ; c1
        faddp   st1, st0                ; s1 c1

        fld     st1
        fxch    st2
        fmul    st0, st0                ; c1c1 s1 c1
        fld     st1
        fxch    st2
        fmul    st0, st0                ; s1s1 c1c1 s1 c1

        fxch    st3
        fst     dword [sp(%$c1)]        ; c1
        fxch    st2
        fst     dword [sp(%$s1)]        ; s1 c1c1 c1 s1s1

        fmulp   st2, st0                ; c1c1 c1s1 s1s1
        fsubrp  st2, st0                ; c1s1 c2
        fadd    st0, st0                ; s2 c2
        fxch    st1
        fstp    dword [sp(%$c2)]
        fstp    dword [sp(%$s2)]

        mov     r0, [sp(%$fz)]
        mov     r1, [sp(%$fz)]
        add     r0, r5                  ; r0 = fi = fz + i
        add     r1, r3
        sub     r1, r5                  ; r1 = gi = fz + k1 - i

.do3:
        ; ---- stage 1a: fi[0]/gi[0] with fi[k1]/gi[k1] ----
        fld     dword [sp(%$s2)]
        fmul    dword [r0+r3]
        fld     dword [sp(%$c2)]
        fmul    dword [r1+r3]

        fld     dword [sp(%$c2)]
        fmul    dword [r0+r3]
        fld     dword [sp(%$s2)]
        fmul    dword [r1+r3]
        fxch    st2
        fsubp   st3, st0                ; b = s2*fi[k1] - c2*gi[k1]
        faddp   st1, st0                ; a = c2*fi[k1] + s2*gi[k1]  b

        fld     dword [r1]
        fsub    st0, st2                ; g1 a b
        fxch    st2
        fadd    dword [r1]              ; g0 a g1

        fld     dword [r0]
        fsub    st0, st2                ; f1 g0 a g1
        fxch    st2
        fadd    dword [r0]              ; f0 g0 f1 g1

        fxch    st3
        fstp    dword [sp(%$g1)]
        fstp    dword [sp(%$g0)]
        fstp    dword [sp(%$f1)]
        fstp    dword [sp(%$f0)]

        ; ---- stage 1b: fi[k2]/gi[k2] with fi[k3]/gi[k3] ----
        fld     dword [sp(%$s2)]
        fmul    dword [r0+r2]
        fld     dword [sp(%$c2)]
        fmul    dword [r1+r2]

        fld     dword [sp(%$c2)]
        fmul    dword [r0+r2]
        fld     dword [sp(%$s2)]
        fmul    dword [r1+r2]
        fxch    st2
        fsubp   st3, st0                ; b = s2*fi[k3] - c2*gi[k3]
        faddp   st1, st0                ; a = c2*fi[k3] + s2*gi[k3]  b

        fld     dword [r1+r3*2]
        fsub    st0, st2                ; g3 a b
        fxch    st2
        fadd    dword [r1+r3*2]         ; g2 a g3

        fld     dword [r0+r3*2]
        fsub    st0, st2                ; f3 g2 a g3
        fxch    st2
        fadd    dword [r0+r3*2]         ; f2 g2 f3 g3

        fxch    st3
        fstp    dword [sp(%$g3)]
        fstp    dword [sp(%$g2)]
        fstp    dword [sp(%$f3)]
        fstp    dword [sp(%$f2)]

        ; ---- stage 2a: writes gi[k3], gi[k1], fi[k2], fi[0] ----
        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$f2)]
        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$g3)]

        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$f2)]
        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$g3)]
        fxch    st2
        fsubp   st3, st0                ; b = s1*f2 - c1*g3
        faddp   st1, st0                ; a = c1*f2 + s1*g3  b

        fld     dword [sp(%$g1)]
        fsub    st0, st2                ; gi[k3] a b
        fxch    st2
        fadd    dword [sp(%$g1)]        ; gi[k1] a gi[k3]

        fld     dword [sp(%$f0)]
        fsub    st0, st2                ; fi[k2] gi[k1] a gi[k3]
        fxch    st2
        fadd    dword [sp(%$f0)]        ; fi[0] gi[k1] fi[k2] gi[k3]

        fxch    st3
        fstp    dword [r1+r2]
        fstp    dword [r1+r3]
        fstp    dword [r0+r3*2]
        fstp    dword [r0]

        ; ---- stage 2b: writes fi[k3], fi[k1], gi[k2], gi[0] ----
        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$g2)]
        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$f3)]

        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$g2)]
        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$f3)]
        fxch    st2
        fsubp   st3, st0                ; b = c1*g2 - s1*f3
        faddp   st1, st0                ; a = s1*g2 + c1*f3  b

        fld     dword [sp(%$f1)]
        fsub    st0, st2                ; fi[k3] a b
        fxch    st2
        fadd    dword [sp(%$f1)]        ; fi[k1] a fi[k3]

        fld     dword [sp(%$g0)]
        fsub    st0, st2                ; gi[k2] fi[k1] a fi[k3]
        fxch    st2
        fadd    dword [sp(%$g0)]        ; gi[0] fi[k1] gi[k2] fi[k3]

        fxch    st3
        fstp    dword [r0+r2]
        fstp    dword [r0+r3]
        fstp    dword [r1+r3*2]
        fstp    dword [r1]

        lea     r0, [r0+r3*4]           ; fi += k4
        lea     r1, [r1+r3*4]           ; gi += k4
        cmp     r0, r6
        jb near .do3                    ; while (fi < fn)

        add     r5, 4                   ; i++
        cmp     r5, r4
        jb near .for                    ; while (i < kx)

        cmp     r3, [sp(%$n)]           ; k1*fsize (== k4 in elements) vs n
        jae     .exit

        add     dword [sp(%$k)], 2      ; k += 2;
        lea     r3, [r3*4]              ; k1 *= 4
        lea     r2, [r2*4]              ; k3 *= 4
        lea     r4, [r4*4]              ; kx *= 4
        mov     r0, [sp(%$fz)]          ; fi
        lea     r1, [r0+r4]             ; gi = fi + kx
        jmp     .do

.exit:
        popd    ebp, ebx, esi, edi
endproc

        end