From a6c273927c5bb212e806be6ae10c81dcd81b2152 Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Mon, 15 Jul 2013 18:28:17 +0100
Subject: [PATCH 48/49] [ffmpeg] - backport - arm: Add VFP-accelerated version
 of qmf_32_subbands
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

               Before           After
               Mean    StdDev   Mean    StdDev  Change
This function   1323.0  98.0      746.2  60.6   +77.3%
Overall        15400.0 336.4    14147.5 288.4    +8.9%

Signed-off-by: Martin Storsjö <martin@martin.st>
---
 lib/ffmpeg/libavcodec/arm/dcadsp_init_arm.c |   10 +-
 lib/ffmpeg/libavcodec/arm/dcadsp_vfp.S      |  493 +++++++++++++++++++++++++++
 2 files changed, 502 insertions(+), 1 deletion(-)
 create mode 100644 lib/ffmpeg/libavcodec/arm/dcadsp_vfp.S

diff --git a/lib/ffmpeg/libavcodec/arm/dcadsp_init_arm.c b/lib/ffmpeg/libavcodec/arm/dcadsp_init_arm.c
index a1efbff..58267a2 100644
--- a/lib/ffmpeg/libavcodec/arm/dcadsp_init_arm.c
+++ b/lib/ffmpeg/libavcodec/arm/dcadsp_init_arm.c
@@ -26,6 +26,12 @@

 void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
                         int decifactor, float scale);
+void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
+                                SynthFilterContext *synth, FFTContext *imdct,
+                                float synth_buf_ptr[512], /* implemented in dcadsp_vfp.S */
+                                int *synth_buf_offset, float synth_buf2[32],
+                                const float window[512], float *samples_out,
+                                float raXin[32], float scale);
 void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
                          int decifactor, float scale);

@@ -33,8 +39,10 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
 {
     int cpu_flags = av_get_cpu_flags();

-    if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
+    if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
         s->lfe_fir = ff_dca_lfe_fir_vfp;
+        s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp; /* no NEON override below */
+    }
     if (have_neon(cpu_flags))
         s->lfe_fir = ff_dca_lfe_fir_neon;
 }
diff --git a/lib/ffmpeg/libavcodec/arm/dcadsp_vfp.S b/lib/ffmpeg/libavcodec/arm/dcadsp_vfp.S
new file mode 100644
index 0000000..6039e87
--- /dev/null
+++ b/lib/ffmpeg/libavcodec/arm/dcadsp_vfp.S
@@ -0,0 +1,493 @@
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+POUT          .req    a1   @ float *out; advanced by each vstmia in inner_loop
+PIN           .req    a2   @ const float *in; read at word offsets 0 down to -7
+PCOEF         .req    a3   @ const float *coefs; stepped 4*JMAX words at a time
+DECIFACTOR    .req    a4   @ int decifactor; only compared against 32 on entry
+OLDFPSCR      .req    a4   @ caller FPSCR; takes over a4 once decifactor is tested
+COUNTER       .req    ip   @ loop counter for each pass of dca_lfe_fir
+
+SCALE32       .req    s28  @ use vector of 4 in place of 9th scalar when decifactor=32 / JMAX=8
+SCALE64       .req    s0   @ spare register in scalar bank when decifactor=64 / JMAX=4
+IN0           .req    s4   @ in[0]
+IN1           .req    s5   @ in[-1]
+IN2           .req    s6   @ in[-2]
+IN3           .req    s7   @ in[-3]
+IN4           .req    s0   @ in[-4]..in[-7]: loaded only when decifactor=32,
+IN5           .req    s1   @   after the scale in s0 has been copied to SCALE32
+IN6           .req    s2
+IN7           .req    s3
+COEF0         .req    s8   @ coefficient elements
+COEF1         .req    s9
+COEF2         .req    s10
+COEF3         .req    s11
+COEF4         .req    s12  @ second group of four, loaded while COEF0-3 are in use
+COEF5         .req    s13
+COEF6         .req    s14
+COEF7         .req    s15
+ACCUM0        .req    s16  @ double-buffered multiply-accumulate results
+ACCUM4        .req    s20  @ second partial sum; added to ACCUM0 in the tail
+POST0         .req    s24  @ do long-latency post-multiply in this vector in parallel
+POST1         .req    s25
+POST2         .req    s26
+POST3         .req    s27
+
+
+.macro inner_loop  decifactor, dir, tail, head  @ head: start 4 new outputs; tail: scale and store the previous 4
+ .ifc "\dir","up"
+  .set X, 0                @ up: coefficient j of row r is read from byte offset 4*(r*JMAX + j)
+  .set Y, 4
+ .else
+  .set X, 4*JMAX*4 - 4     @ down: same block of 4*JMAX words, indexed from its last word
+  .set Y, -4               @   and stepping backwards
+ .endif
+ .ifnc "\head",""
+        vldr    COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]  @ coefficient 0 of each of the 4 rows
+        vldr    COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]  @   (one row of JMAX coefficients per output)
+        vldr    COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
+        vldr    COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
+ .endif
+ .ifnc "\tail",""
+        vadd.f  POST0, ACCUM0, ACCUM4   @ vector operation
+ .endif
+ .ifnc "\head",""
+        vmul.f  ACCUM0, COEF0, IN0      @ vector = vector * scalar
+        vldr    COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]  @ coefficient 1 of each row, into the other four COEF registers
+        vldr    COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
+        vldr    COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
+ .endif
+ .ifnc "\tail",""
+        vmul.f  POST0, POST0, SCALE\decifactor  @ vector operation (SCALE may be scalar)
+ .endif
+ .ifnc "\head",""
+        vldr    COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
+   .ifc "\tail",""
+        vmul.f  ACCUM4, COEF4, IN1      @ vector operation (placed here when no tail is interleaved)
+   .endif
+        vldr    COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]  @ coefficient 2 of each row
+        vldr    COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
+   .ifnc "\tail",""
+        vmul.f  ACCUM4, COEF4, IN1      @ vector operation (same multiply, two loads later when a tail is interleaved)
+   .endif
+        vldr    COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
+        vldr    COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
+ .endif
+ .ifnc "\tail",""
+        vstmia  POUT!, {POST0-POST3}    @ write the 4 finished outputs and advance POUT
+ .endif
+ .ifnc "\head",""
+        vmla.f  ACCUM0, COEF0, IN2      @ vector = vector * scalar
+        vldr    COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]  @ coefficient 3 of each row
+        vldr    COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
+        vldr    COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
+        vldr    COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
+        vmla.f  ACCUM4, COEF4, IN3      @ vector = vector * scalar
+  .if \decifactor == 32
+        vldr    COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]  @ coefficients 4-7 exist only when JMAX=8
+        vldr    COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
+        vldr    COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
+        vldr    COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
+        vmla.f  ACCUM0, COEF0, IN4      @ vector = vector * scalar
+        vldr    COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
+        vldr    COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
+        vldr    COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
+        vldr    COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
+        vmla.f  ACCUM4, COEF4, IN5      @ vector = vector * scalar
+        vldr    COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
+        vldr    COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
+        vldr    COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
+        vldr    COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
+        vmla.f  ACCUM0, COEF0, IN6      @ vector = vector * scalar
+        vldr    COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
+        vldr    COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
+        vldr    COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
+        vldr    COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
+        vmla.f  ACCUM4, COEF4, IN7      @ vector = vector * scalar
+  .endif
+ .endif
+.endm
+
+.macro dca_lfe_fir  decifactor  @ whole routine body for one decifactor value, including FPSCR restore and return
+ .if \decifactor == 32
+  .set JMAX, 8             @ 8 coefficients (and inputs IN0-IN7) per output
+        vpush   {s16-s31}               @ s28-s31 are saved as well because SCALE32 lives there
+        vmov    SCALE32, s0             @ duplicate scalar across vector
+        vldr    IN4, [PIN, #-4*4]       @ overwrites s0, hence the copy to SCALE32 first
+        vldr    IN5, [PIN, #-5*4]
+        vldr    IN6, [PIN, #-6*4]
+        vldr    IN7, [PIN, #-7*4]
+ .else
+  .set JMAX, 4             @ 4 coefficients per output; the scale stays in s0 (SCALE64)
+        vpush   {s16-s27}
+ .endif
+
+        mov     COUNTER, #\decifactor/4 - 1     @ loop count; the first head and the last tail are peeled
+        inner_loop  \decifactor, up,, head
+1:      add     PCOEF, PCOEF, #4*JMAX*4         @ next block of 4*JMAX coefficients
+        subs    COUNTER, COUNTER, #1            @ Z is consumed by the bne after the macro body
+        inner_loop  \decifactor, up, tail, head
+        bne     1b
+        inner_loop  \decifactor, up, tail
+
+        mov     COUNTER, #\decifactor/4 - 1     @ second pass, same number of groups
+        inner_loop  \decifactor, down,, head    @ starts on the block the up pass ended on, read backwards
+1:      sub     PCOEF, PCOEF, #4*JMAX*4         @ previous block of 4*JMAX coefficients
+        subs    COUNTER, COUNTER, #1
+        inner_loop  \decifactor, down, tail, head
+        bne     1b
+        inner_loop  \decifactor, down, tail
+
+ .if \decifactor == 32
+        vpop    {s16-s31}
+ .else
+        vpop    {s16-s27}
+ .endif
+        fmxr    FPSCR, OLDFPSCR                 @ put back the FPSCR value saved on entry
+        bx      lr
+.endm
+
+
+/* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
+ *                         int decifactor, float scale)
+ * Writes 2*decifactor floats to out: one pass up through coefs, one back down. */
+function ff_dca_lfe_fir_vfp, export=1
+        teq     DECIFACTOR, #32         @ Z is kept until the beq below
+        fmrx    OLDFPSCR, FPSCR         @ overwrites a4; decifactor is not read again
+        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
+        fmxr    FPSCR, ip
+NOVFP   vldr    s0, [sp]                @ scale taken from the first stack word (NOVFP builds only; prefix comes from asm.S)
+        vldr    IN0, [PIN, #-0*4]       @ in[0]..in[-3] are used by both variants
+        vldr    IN1, [PIN, #-1*4]
+        vldr    IN2, [PIN, #-2*4]
+        vldr    IN3, [PIN, #-3*4]
+        beq     32f
+64:     dca_lfe_fir  64                 @ every decifactor other than 32 takes this path
+ .ltorg                                 @ literal pool for the FPSCR constant, after the bx lr that ends the expansion above
+32:     dca_lfe_fir  32
+endfunc
+
+        .unreq  POUT                    @ drop the LFE FIR aliases: OLDFPSCR is redefined on a different register below
+        .unreq  PIN
+        .unreq  PCOEF
+        .unreq  DECIFACTOR
+        .unreq  OLDFPSCR
+        .unreq  COUNTER
+
+        .unreq  SCALE32
+        .unreq  SCALE64
+        .unreq  IN0
+        .unreq  IN1
+        .unreq  IN2
+        .unreq  IN3
+        .unreq  IN4
+        .unreq  IN5
+        .unreq  IN6
+        .unreq  IN7
+        .unreq  COEF0
+        .unreq  COEF1
+        .unreq  COEF2
+        .unreq  COEF3
+        .unreq  COEF4
+        .unreq  COEF5
+        .unreq  COEF6
+        .unreq  COEF7
+        .unreq  ACCUM0
+        .unreq  ACCUM4
+        .unreq  POST0
+        .unreq  POST1
+        .unreq  POST2
+        .unreq  POST3
+
+
+IN      .req    a1  @ samples_in; advanced 4 source rows per pass of the main loop
+SBACT   .req    a2  @ sb_act; folded into COUNT at the start and not read again
+OLDFPSCR .req   a3  @ caller FPSCR (a3 holds synth on entry, which this routine never reads)
+IMDCT   .req    a4  @ imdct on entry; the code reloads it from the saved a4 stack slot instead of using this alias
+WINDOW  .req    v1  @ window argument, loaded from the caller stack
+OUT     .req    v2  @ samples_out, advanced 32 floats per filter call
+BUF     .req    v3  @ cursor into the transposed buffer allocated on the stack
+SCALEINT .req   v4 @ only used in softfp case
+COUNT   .req    v5  @ packed counter while transposing, then the 8..1 row counter
+
+SCALE   .req    s0  @ scale as received in s0 (VFP-prefixed lines only)
+
+/* Stack layout differs in softfp and hardfp cases:
+ *
+ * hardfp
+ *      fp -> 6 arg words saved by caller
+ *            a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
+ *            s16-s23 on entry
+ *            align 16
+ *     buf -> 8*32*4 bytes buffer
+ *            s0 on entry
+ *      sp -> 3 arg words for callee
+ *
+ * softfp
+ *      fp -> 7 arg words saved by caller
+ *            a4,v1-v5,fp,lr on entry
+ *            s16-s23 on entry
+ *            align 16
+ *     buf -> 8*32*4 bytes buffer
+ *      sp -> 4 arg words for callee
+ */
+
+/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
+ *                                 SynthFilterContext *synth, FFTContext *imdct,
+ *                                 float synth_buf_ptr[512],
+ *                                 int *synth_buf_offset, float synth_buf2[32],
+ *                                 const float window[512], float *samples_out,
+ *                                 float raXin[32], float scale);
+ * Transposes samples_in into an 8x32 stack buffer, then runs the synth filter on each of its 8 rows. */
+function ff_dca_qmf_32_subbands_vfp, export=1
+VFP     push    {a3-a4,v1-v3,v5,fp,lr}  @ 8 words; a4 (imdct) ends up at fp-7*4
+NOVFP   push    {a4,v1-v5,fp,lr}        @ 8 words; a4 (imdct) ends up at fp-8*4
+        add     fp, sp, #8*4            @ fp -> first stack argument (synth_buf_ptr)
+        vpush   {s16-s23}               @ used below as transposition scratch
+        @ The buffer pointed at by raXin isn't big enough for us to do a
+        @ complete matrix transposition as we want to, so allocate an
+        @ alternative buffer from the stack. Align to 4 words for speed.
+        sub     BUF, sp, #8*32*4        @ 8 rows of 32 floats
+        bic     BUF, BUF, #15
+        mov     sp, BUF
+        ldr     lr, =0x03330000     @ RunFast mode, short vectors of length 4, stride 2
+        fmrx    OLDFPSCR, FPSCR     @ saved in a3; restored at label 6 before calling out
+        fmxr    FPSCR, lr
+        @ COUNT is used to count down 2 things at once:
+        @ bits 0-4 are the number of word pairs remaining in the output row
+        @ bits 5-31 are the number of words to copy (with possible negation)
+        @   from the source matrix before we start zeroing the remainder
+        mov     COUNT, #(-4 << 5) + 16
+        adds    COUNT, COUNT, SBACT, lsl #5     @ COUNT = ((sb_act - 4) << 5) + 16
+        bmi     2f                              @ sb_act < 4: no full group of 4 source rows
+1:                                              @ main loop: transpose source rows 0-3 of the current group
+        vldr    s8,  [IN, #(0*8+0)*4]           @ row 0 -> even registers s8..s22
+        vldr    s10, [IN, #(0*8+1)*4]
+        vldr    s12, [IN, #(0*8+2)*4]
+        vldr    s14, [IN, #(0*8+3)*4]
+        vldr    s16, [IN, #(0*8+4)*4]
+        vldr    s18, [IN, #(0*8+5)*4]
+        vldr    s20, [IN, #(0*8+6)*4]
+        vldr    s22, [IN, #(0*8+7)*4]
+        vneg.f  s8, s8                          @ stride-2 short vector: negates s8,s10,s12,s14 (row 0, columns 0-3)
+        vldr    s9,  [IN, #(1*8+0)*4]           @ row 1 -> odd registers s9..s23
+        vldr    s11, [IN, #(1*8+1)*4]
+        vldr    s13, [IN, #(1*8+2)*4]
+        vldr    s15, [IN, #(1*8+3)*4]
+        vneg.f  s16, s16                        @ negates s16,s18,s20,s22 (row 0, columns 4-7)
+        vldr    s17, [IN, #(1*8+4)*4]
+        vldr    s19, [IN, #(1*8+5)*4]
+        vldr    s21, [IN, #(1*8+6)*4]
+        vldr    s23, [IN, #(1*8+7)*4]
+        vstr    d4,  [BUF, #(0*32+0)*4]         @ d4 = {s8,s9}: column 0 of rows 0,1 -> words 0,1 of output row 0
+        vstr    d5,  [BUF, #(1*32+0)*4]         @ d5..d11: columns 1-7 -> output rows 1-7
+        vstr    d6,  [BUF, #(2*32+0)*4]
+        vstr    d7,  [BUF, #(3*32+0)*4]
+        vstr    d8,  [BUF, #(4*32+0)*4]
+        vstr    d9,  [BUF, #(5*32+0)*4]
+        vstr    d10, [BUF, #(6*32+0)*4]
+        vstr    d11, [BUF, #(7*32+0)*4]
+        vldr    s9,  [IN, #(3*8+0)*4]           @ row 3 -> odd registers (negated below)
+        vldr    s11, [IN, #(3*8+1)*4]
+        vldr    s13, [IN, #(3*8+2)*4]
+        vldr    s15, [IN, #(3*8+3)*4]
+        vldr    s17, [IN, #(3*8+4)*4]
+        vldr    s19, [IN, #(3*8+5)*4]
+        vldr    s21, [IN, #(3*8+6)*4]
+        vldr    s23, [IN, #(3*8+7)*4]
+        vneg.f  s9, s9                          @ negates s9,s11,s13,s15 (row 3, columns 0-3)
+        vldr    s8,  [IN, #(2*8+0)*4]           @ row 2 -> even registers (not negated)
+        vldr    s10, [IN, #(2*8+1)*4]
+        vldr    s12, [IN, #(2*8+2)*4]
+        vldr    s14, [IN, #(2*8+3)*4]
+        vneg.f  s17, s17                        @ negates s17,s19,s21,s23 (row 3, columns 4-7)
+        vldr    s16, [IN, #(2*8+4)*4]
+        vldr    s18, [IN, #(2*8+5)*4]
+        vldr    s20, [IN, #(2*8+6)*4]
+        vldr    s22, [IN, #(2*8+7)*4]
+        vstr    d4,  [BUF, #(0*32+2)*4]         @ rows 2,3 -> words 2,3 of each output row
+        vstr    d5,  [BUF, #(1*32+2)*4]
+        vstr    d6,  [BUF, #(2*32+2)*4]
+        vstr    d7,  [BUF, #(3*32+2)*4]
+        vstr    d8,  [BUF, #(4*32+2)*4]
+        vstr    d9,  [BUF, #(5*32+2)*4]
+        vstr    d10, [BUF, #(6*32+2)*4]
+        vstr    d11, [BUF, #(7*32+2)*4]
+        add     IN, IN, #4*8*4                  @ next group of 4 source rows
+        add     BUF, BUF, #4*4                  @ 4 words further along every output row
+        subs    COUNT, COUNT, #(4 << 5) + 2     @ 4 fewer rows to copy, 2 fewer word pairs left
+        bpl     1b
+2:      @ Now deal with trailing < 4 samples
+        adds    COUNT, COUNT, #3 << 5           @ bits 5-31 now hold (rows left - 1)
+        bmi     4f  @ sb_act was a multiple of 4
+        bics    lr, COUNT, #0x1F                @ lr is scratch; Z set when exactly 1 row is left
+        bne     3f
+        @ sb_act was n*4+1
+        vldr    s8,  [IN, #(0*8+0)*4]           @ last row -> even registers (negated, as row 0 of a group)
+        vldr    s10, [IN, #(0*8+1)*4]
+        vldr    s12, [IN, #(0*8+2)*4]
+        vldr    s14, [IN, #(0*8+3)*4]
+        vldr    s16, [IN, #(0*8+4)*4]
+        vldr    s18, [IN, #(0*8+5)*4]
+        vldr    s20, [IN, #(0*8+6)*4]
+        vldr    s22, [IN, #(0*8+7)*4]
+        vneg.f  s8, s8
+        vldr    s9,  zero                       @ odd lanes padded from the zero literal after the function
+        vldr    s11, zero
+        vldr    s13, zero
+        vldr    s15, zero
+        vneg.f  s16, s16
+        vldr    s17, zero
+        vldr    s19, zero
+        vldr    s21, zero
+        vldr    s23, zero
+        vstr    d4,  [BUF, #(0*32+0)*4]
+        vstr    d5,  [BUF, #(1*32+0)*4]
+        vstr    d6,  [BUF, #(2*32+0)*4]
+        vstr    d7,  [BUF, #(3*32+0)*4]
+        vstr    d8,  [BUF, #(4*32+0)*4]
+        vstr    d9,  [BUF, #(5*32+0)*4]
+        vstr    d10, [BUF, #(6*32+0)*4]
+        vstr    d11, [BUF, #(7*32+0)*4]
+        add     BUF, BUF, #2*4
+        sub     COUNT, COUNT, #1                @ one word pair written
+        b       4f
+3:      @ sb_act was n*4+2 or n*4+3, so do the first 2
+        vldr    s8,  [IN, #(0*8+0)*4]
+        vldr    s10, [IN, #(0*8+1)*4]
+        vldr    s12, [IN, #(0*8+2)*4]
+        vldr    s14, [IN, #(0*8+3)*4]
+        vldr    s16, [IN, #(0*8+4)*4]
+        vldr    s18, [IN, #(0*8+5)*4]
+        vldr    s20, [IN, #(0*8+6)*4]
+        vldr    s22, [IN, #(0*8+7)*4]
+        vneg.f  s8, s8
+        vldr    s9,  [IN, #(1*8+0)*4]
+        vldr    s11, [IN, #(1*8+1)*4]
+        vldr    s13, [IN, #(1*8+2)*4]
+        vldr    s15, [IN, #(1*8+3)*4]
+        vneg.f  s16, s16
+        vldr    s17, [IN, #(1*8+4)*4]
+        vldr    s19, [IN, #(1*8+5)*4]
+        vldr    s21, [IN, #(1*8+6)*4]
+        vldr    s23, [IN, #(1*8+7)*4]
+        vstr    d4,  [BUF, #(0*32+0)*4]
+        vstr    d5,  [BUF, #(1*32+0)*4]
+        vstr    d6,  [BUF, #(2*32+0)*4]
+        vstr    d7,  [BUF, #(3*32+0)*4]
+        vstr    d8,  [BUF, #(4*32+0)*4]
+        vstr    d9,  [BUF, #(5*32+0)*4]
+        vstr    d10, [BUF, #(6*32+0)*4]
+        vstr    d11, [BUF, #(7*32+0)*4]
+        add     BUF, BUF, #2*4
+        sub     COUNT, COUNT, #(2 << 5) + 1     @ 2 rows copied, one word pair written
+        bics    lr, COUNT, #0x1F                @ Z set when exactly 1 more row is left
+        bne     4f
+        @ sb_act was n*4+3
+        vldr    s8,  [IN, #(2*8+0)*4]           @ row 2 of the group: copied without negation
+        vldr    s10, [IN, #(2*8+1)*4]
+        vldr    s12, [IN, #(2*8+2)*4]
+        vldr    s14, [IN, #(2*8+3)*4]
+        vldr    s16, [IN, #(2*8+4)*4]
+        vldr    s18, [IN, #(2*8+5)*4]
+        vldr    s20, [IN, #(2*8+6)*4]
+        vldr    s22, [IN, #(2*8+7)*4]
+        vldr    s9,  zero
+        vldr    s11, zero
+        vldr    s13, zero
+        vldr    s15, zero
+        vldr    s17, zero
+        vldr    s19, zero
+        vldr    s21, zero
+        vldr    s23, zero
+        vstr    d4,  [BUF, #(0*32+0)*4]
+        vstr    d5,  [BUF, #(1*32+0)*4]
+        vstr    d6,  [BUF, #(2*32+0)*4]
+        vstr    d7,  [BUF, #(3*32+0)*4]
+        vstr    d8,  [BUF, #(4*32+0)*4]
+        vstr    d9,  [BUF, #(5*32+0)*4]
+        vstr    d10, [BUF, #(6*32+0)*4]
+        vstr    d11, [BUF, #(7*32+0)*4]
+        add     BUF, BUF, #2*4
+        sub     COUNT, COUNT, #1
+4:      @ Now fill the remainder with 0
+        vldr    s8, zero                        @ d4 = {0.0, 0.0}
+        vldr    s9, zero
+        ands    COUNT, COUNT, #0x1F             @ word pairs still unwritten in each output row
+        beq     6f
+5:      vstr    d4, [BUF, #(0*32+0)*4]
+        vstr    d4, [BUF, #(1*32+0)*4]
+        vstr    d4, [BUF, #(2*32+0)*4]
+        vstr    d4, [BUF, #(3*32+0)*4]
+        vstr    d4, [BUF, #(4*32+0)*4]
+        vstr    d4, [BUF, #(5*32+0)*4]
+        vstr    d4, [BUF, #(6*32+0)*4]
+        vstr    d4, [BUF, #(7*32+0)*4]
+        add     BUF, BUF, #2*4
+        subs    COUNT, COUNT, #1
+        bne     5b
+6:
+        fmxr    FPSCR, OLDFPSCR                 @ caller FPSCR is back in place before the calls below
+        ldr     WINDOW, [fp, #3*4]              @ 4th stack argument: window
+        ldr     OUT, [fp, #4*4]                 @ 5th stack argument: samples_out
+        sub     BUF, BUF, #32*4                 @ rewind the 16 word pairs written: back to output row 0
+NOVFP   ldr     SCALEINT, [fp, #6*4]            @ 7th stack argument: scale, kept as a raw word
+        mov     COUNT, #8                       @ one filter call per buffer row
+VFP     vpush   {SCALE}                         @ keep s0 on the stack so it can be reloaded before every call
+VFP     sub     sp, sp, #3*4                    @ room for the callee stack arguments
+NOVFP   sub     sp, sp, #4*4
+7:
+VFP     ldr     a1, [fp, #-7*4]     @ imdct
+NOVFP   ldr     a1, [fp, #-8*4]     @ imdct (saved a4 sits one word lower in this frame)
+        ldmia   fp, {a2-a4}                     @ synth_buf_ptr, synth_buf_offset, synth_buf2
+VFP     stmia   sp, {WINDOW, OUT, BUF}          @ stack arguments for the callee
+NOVFP   stmia   sp, {WINDOW, OUT, BUF, SCALEINT}
+VFP     vldr    SCALE, [sp, #3*4]               @ s0 reloaded from the slot pushed above
+        bl      ff_synth_filter_float_vfp
+        add     OUT, OUT, #32*4                 @ 32 floats further in samples_out
+        add     BUF, BUF, #32*4                 @ next row of the transposed buffer
+        subs    COUNT, COUNT, #1
+        bne     7b
+
+A       sub     sp, fp, #(8+8)*4                @ sp -> saved s16-s23: 8 words below the 8 pushed core registers
+T       sub     fp, fp, #(8+8)*4                @ same adjustment done via fp (A/T prefixes come from asm.S)
+T       mov     sp, fp
+        vpop    {s16-s23}
+VFP     pop     {a3-a4,v1-v3,v5,fp,pc}
+NOVFP   pop     {a4,v1-v5,fp,pc}
+endfunc
+
+        .unreq  IN                      @ drop the aliases used by ff_dca_qmf_32_subbands_vfp
+        .unreq  SBACT
+        .unreq  OLDFPSCR
+        .unreq  IMDCT
+        .unreq  WINDOW
+        .unreq  OUT
+        .unreq  BUF
+        .unreq  SCALEINT
+        .unreq  COUNT
+
+        .unreq  SCALE
+
+        .align 2
+zero:   .word   0                       @ all-zero word read by the "vldr sN, zero" loads in ff_dca_qmf_32_subbands_vfp
--
1.7.9.5