1 files changed, 311 insertions, 0 deletions
diff --git a/lib/ffmpeg/patches/0040-ffmpeg-backport-arm-Add-VFP-accelerated-version-of-s.patch b/lib/ffmpeg/patches/0040-ffmpeg-backport-arm-Add-VFP-accelerated-version-of-s.patch
new file mode 100644
index 0000000000..d221f29a86
--- /dev/null
+++ b/lib/ffmpeg/patches/0040-ffmpeg-backport-arm-Add-VFP-accelerated-version-of-s.patch
@@ -0,0 +1,311 @@
+From 40daea3c1bafa9cea37b65f856c3c0432767d760 Mon Sep 17 00:00:00 2001
+From: Ben Avison <bavison@riscosopen.org>
+Date: Mon, 15 Jul 2013 18:28:09 +0100
+Subject: [PATCH 39/49] [ffmpeg] - backport - arm: Add VFP-accelerated version
+ of synth_filter_float
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+               Before           After
+               Mean    StdDev   Mean    StdDev  Change
+This function   9295.0 114.9     4853.2 83.5    +91.5%
+Overall        23699.8 397.6    19285.5 292.0   +22.9%
+
+Signed-off-by: Martin Storsjö <martin@martin.st>
+---
+ lib/ffmpeg/libavcodec/arm/Makefile           |    1 +
+ lib/ffmpeg/libavcodec/arm/fft_init_arm.c     |    8 +
+ lib/ffmpeg/libavcodec/arm/synth_filter_vfp.S |  243 ++++++++++++++++++++++++++
+ 3 files changed, 252 insertions(+)
+ create mode 100644 lib/ffmpeg/libavcodec/arm/synth_filter_vfp.S
+
+diff --git a/lib/ffmpeg/libavcodec/arm/Makefile b/lib/ffmpeg/libavcodec/arm/Makefile
+index 1c91d62..aee9d73 100644
+--- a/lib/ffmpeg/libavcodec/arm/Makefile
++++ b/lib/ffmpeg/libavcodec/arm/Makefile
+@@ -58,6 +58,7 @@ ARMV6-OBJS                             += arm/dsputil_init_armv6.o      \
+                                           arm/dsputil_armv6.o           \
+                                           arm/simple_idct_armv6.o       \
+
++VFP-OBJS-$(CONFIG_DCA_DECODER)         += arm/synth_filter_vfp.o
+ VFP-OBJS-$(HAVE_ARMV6)                 += arm/fmtconvert_vfp.o
+
+ NEON-OBJS-$(CONFIG_FFT)                += arm/fft_neon.o                \
+diff --git a/lib/ffmpeg/libavcodec/arm/fft_init_arm.c b/lib/ffmpeg/libavcodec/arm/fft_init_arm.c
+index 8c98abc..fe0acc5 100644
+--- a/lib/ffmpeg/libavcodec/arm/fft_init_arm.c
++++ b/lib/ffmpeg/libavcodec/arm/fft_init_arm.c
+@@ -32,6 +32,12 @@ void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input)
+
+ void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);
+
++void ff_synth_filter_float_vfp(FFTContext *imdct,
++                               float *synth_buf_ptr, int *synth_buf_offset,
++                               float synth_buf2[32], const float window[512],
++                               float out[32], const float in[32],
++                               float scale);
++
+ void ff_synth_filter_float_neon(FFTContext *imdct,
+                                 float *synth_buf_ptr, int *synth_buf_offset,
+                                 float synth_buf2[32], const float window[512],
+@@ -71,6 +77,8 @@ av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
+ {
+     int cpu_flags = av_get_cpu_flags();
+
++    if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
++        s->synth_filter_float = ff_synth_filter_float_vfp;
+     if (have_neon(cpu_flags))
+         s->synth_filter_float = ff_synth_filter_float_neon;
+ }
+diff --git a/lib/ffmpeg/libavcodec/arm/synth_filter_vfp.S b/lib/ffmpeg/libavcodec/arm/synth_filter_vfp.S
+new file mode 100644
+index 0000000..c219c41
+--- /dev/null
++++ b/lib/ffmpeg/libavcodec/arm/synth_filter_vfp.S
+@@ -0,0 +1,243 @@
++/*
++ * Copyright (c) 2013 RISC OS Open Ltd
++ * Author: Ben Avison <bavison@riscosopen.org>
++ *
++ * This file is part of Libav.
++ *
++ * Libav is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * Libav is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with Libav; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/arm/asm.S"
++
++IMDCT         .req    r0
++ORIG_P_SB     .req    r1
++P_SB_OFF      .req    r2
++I             .req    r0
++P_SB2_UP      .req    r1
++OLDFPSCR      .req    r2
++P_SB2_DN      .req    r3
++P_WIN_DN      .req    r4
++P_OUT_DN      .req    r5
++P_SB          .req    r6
++J_WRAP        .req    r7
++P_WIN_UP      .req    r12
++P_OUT_UP      .req    r14
++
++SCALE         .req    s0
++SBUF_DAT_REV0 .req    s4
++SBUF_DAT_REV1 .req    s5
++SBUF_DAT_REV2 .req    s6
++SBUF_DAT_REV3 .req    s7
++VA0           .req    s8
++VA3           .req    s11
++VB0           .req    s12
++VB3           .req    s15
++VC0           .req    s8
++VC3           .req    s11
++VD0           .req    s12
++VD3           .req    s15
++SBUF_DAT0     .req    s16
++SBUF_DAT1     .req    s17
++SBUF_DAT2     .req    s18
++SBUF_DAT3     .req    s19
++SBUF_DAT_ALT0 .req    s20
++SBUF_DAT_ALT1 .req    s21
++SBUF_DAT_ALT2 .req    s22
++SBUF_DAT_ALT3 .req    s23
++WIN_DN_DAT0   .req    s24
++WIN_UP_DAT0   .req    s28
++
++
++.macro inner_loop  half, tail, head
++ .if (OFFSET & (64*4)) == 0                @ even numbered call
++        SBUF_DAT_THIS0 .req SBUF_DAT0
++        SBUF_DAT_THIS1 .req SBUF_DAT1
++        SBUF_DAT_THIS2 .req SBUF_DAT2
++        SBUF_DAT_THIS3 .req SBUF_DAT3
++  .ifnc "\head",""
++        vldr    d8, [P_SB, #OFFSET]        @ d8 = SBUF_DAT
++        vldr    d9, [P_SB, #OFFSET+8]
++  .endif
++ .else
++        SBUF_DAT_THIS0 .req SBUF_DAT_ALT0
++        SBUF_DAT_THIS1 .req SBUF_DAT_ALT1
++        SBUF_DAT_THIS2 .req SBUF_DAT_ALT2
++        SBUF_DAT_THIS3 .req SBUF_DAT_ALT3
++  .ifnc "\head",""
++        vldr    d10, [P_SB, #OFFSET]       @ d10 = SBUF_DAT_ALT
++        vldr    d11, [P_SB, #OFFSET+8]
++  .endif
++ .endif
++ .ifnc "\tail",""
++  .ifc "\half","ab"
++        vmls.f  VA0, SBUF_DAT_REV0, WIN_DN_DAT0  @ all operands treated as vectors
++  .else
++        vmla.f  VD0, SBUF_DAT_REV0, WIN_DN_DAT0  @ all operands treated as vectors
++  .endif
++ .endif
++ .ifnc "\head",""
++        vldr    d14, [P_WIN_UP, #OFFSET]   @ d14 = WIN_UP_DAT
++        vldr    d15, [P_WIN_UP, #OFFSET+8]
++        vldr    d12, [P_WIN_DN, #OFFSET]   @ d12 = WIN_DN_DAT
++        vldr    d13, [P_WIN_DN, #OFFSET+8]
++        vmov    SBUF_DAT_REV3, SBUF_DAT_THIS0
++        vmov    SBUF_DAT_REV2, SBUF_DAT_THIS1
++        vmov    SBUF_DAT_REV1, SBUF_DAT_THIS2
++        vmov    SBUF_DAT_REV0, SBUF_DAT_THIS3
++  .ifc "\half","ab"
++        vmla.f  VB0, SBUF_DAT_THIS0, WIN_UP_DAT0
++  .else
++        vmla.f  VC0, SBUF_DAT_THIS0, WIN_UP_DAT0
++  .endif
++        teq     J_WRAP, #J
++        bne     2f             @ strongly predictable, so better than cond exec in this case
++        sub     P_SB, P_SB, #512*4
++2:
++  .set J, J - 64
++  .set OFFSET, OFFSET + 64*4
++ .endif
++        .unreq  SBUF_DAT_THIS0
++        .unreq  SBUF_DAT_THIS1
++        .unreq  SBUF_DAT_THIS2
++        .unreq  SBUF_DAT_THIS3
++.endm
++
++
++/* void ff_synth_filter_float_vfp(FFTContext *imdct,
++ *                                float *synth_buf_ptr, int *synth_buf_offset,
++ *                                float synth_buf2[32], const float window[512],
++ *                                float out[32], const float in[32], float scale)
++ */
++function ff_synth_filter_float_vfp, export=1
++        push    {r3-r7,lr}
++        vpush   {s16-s31}
++        ldr     lr, [P_SB_OFF]
++        add     a2, ORIG_P_SB, lr, LSL #2 @ calculate synth_buf to pass to imdct_half
++        mov     P_SB, a2                  @ and keep a copy for ourselves
++        bic     J_WRAP, lr, #63           @ mangled to make testing for wrap easier in inner loop
++        sub     lr, lr, #32
++        and     lr, lr, #512-32
++        str     lr, [P_SB_OFF]            @ rotate offset, modulo buffer size, ready for next call
++        ldr     a3, [sp, #(16+6+2)*4]     @ fetch in from stack, to pass to imdct_half
++VFP     vmov    s16, SCALE                @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case
++        bl      ff_imdct_half_vfp
++VFP     vmov    SCALE, s16
++
++        fmrx    OLDFPSCR, FPSCR
++        ldr     lr, =0x03030000           @ RunFast mode, short vectors of length 4, stride 1
++        fmxr    FPSCR, lr
++        ldr     P_SB2_DN, [sp, #16*4]
++        ldr     P_WIN_DN, [sp, #(16+6+0)*4]
++        ldr     P_OUT_DN, [sp, #(16+6+1)*4]
++NOVFP   vldr    SCALE, [sp, #(16+6+3)*4]
++
++#define IMM_OFF_SKEW 956                   /* also valid immediate constant when you add 16*4 */
++        add     P_SB, P_SB, #IMM_OFF_SKEW  @ so we can use -ve offsets to use full immediate offset range
++        add     P_SB2_UP, P_SB2_DN, #16*4
++        add     P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW
++        add     P_OUT_UP, P_OUT_DN, #16*4
++        add     P_SB2_DN, P_SB2_DN, #16*4
++        add     P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW
++        add     P_OUT_DN, P_OUT_DN, #16*4
++        mov     I, #4
++1:
++        vldmia  P_SB2_UP!, {VB0-VB3}
++        vldmdb  P_SB2_DN!, {VA0-VA3}
++ .set J, 512 - 64
++ .set OFFSET, -IMM_OFF_SKEW
++        inner_loop  ab,, head
++ .rept 7
++        inner_loop  ab, tail, head
++ .endr
++        inner_loop  ab, tail
++        add     P_WIN_UP, P_WIN_UP, #4*4
++        sub     P_WIN_DN, P_WIN_DN, #4*4
++        vmul.f  VB0, VB0, SCALE      @ SCALE treated as scalar
++        add     P_SB, P_SB, #(512+4)*4
++        subs    I, I, #1
++        vmul.f  VA0, VA0, SCALE
++        vstmia  P_OUT_UP!, {VB0-VB3}
++        vstmdb  P_OUT_DN!, {VA0-VA3}
++        bne     1b
++
++        add     P_SB2_DN, P_SB2_DN, #(16+28-12)*4
++        sub     P_SB2_UP, P_SB2_UP, #(16+16)*4
++        add     P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4
++        mov     I, #4
++1:
++        vldr.d  d4, zero             @ d4 = VC0
++        vldr.d  d5, zero
++        vldr.d  d6, zero             @ d6 = VD0
++        vldr.d  d7, zero
++ .set J, 512 - 64
++ .set OFFSET, -IMM_OFF_SKEW
++        inner_loop  cd,, head
++ .rept 7
++        inner_loop  cd, tail, head
++ .endr
++        inner_loop  cd, tail
++        add     P_WIN_UP, P_WIN_UP, #4*4
++        sub     P_WIN_DN, P_WIN_DN, #4*4
++        add     P_SB, P_SB, #(512+4)*4
++        subs    I, I, #1
++        vstmia  P_SB2_UP!, {VC0-VC3}
++        vstmdb  P_SB2_DN!, {VD0-VD3}
++        bne     1b
++
++        fmxr    FPSCR, OLDFPSCR
++        vpop    {s16-s31}
++        pop     {r3-r7,pc}
++endfunc
++
++        .unreq  IMDCT
++        .unreq  ORIG_P_SB
++        .unreq  P_SB_OFF
++        .unreq  I
++        .unreq  P_SB2_UP
++        .unreq  OLDFPSCR
++        .unreq  P_SB2_DN
++        .unreq  P_WIN_DN
++        .unreq  P_OUT_DN
++        .unreq  P_SB
++        .unreq  J_WRAP
++        .unreq  P_WIN_UP
++        .unreq  P_OUT_UP
++
++        .unreq  SCALE
++        .unreq  SBUF_DAT_REV0
++        .unreq  SBUF_DAT_REV1
++        .unreq  SBUF_DAT_REV2
++        .unreq  SBUF_DAT_REV3
++        .unreq  VA0
++        .unreq  VA3
++        .unreq  VB0
++        .unreq  VB3
++        .unreq  VC0
++        .unreq  VC3
++        .unreq  VD0
++        .unreq  VD3
++        .unreq  SBUF_DAT0
++        .unreq  SBUF_DAT1
++        .unreq  SBUF_DAT2
++        .unreq  SBUF_DAT3
++        .unreq  SBUF_DAT_ALT0
++        .unreq  SBUF_DAT_ALT1
++        .unreq  SBUF_DAT_ALT2
++        .unreq  SBUF_DAT_ALT3
++        .unreq  WIN_DN_DAT0
++        .unreq  WIN_UP_DAT0
++
++        .align  3
++zero:   .word   0, 0
+--
+1.7.9.5