diff options
Diffstat (limited to 'lib/ffmpeg/patches/0040-ffmpeg-backport-arm-Add-VFP-accelerated-version-of-s.patch')
-rw-r--r-- | lib/ffmpeg/patches/0040-ffmpeg-backport-arm-Add-VFP-accelerated-version-of-s.patch | 311 |
1 files changed, 311 insertions, 0 deletions
diff --git a/lib/ffmpeg/patches/0040-ffmpeg-backport-arm-Add-VFP-accelerated-version-of-s.patch b/lib/ffmpeg/patches/0040-ffmpeg-backport-arm-Add-VFP-accelerated-version-of-s.patch
new file mode 100644
index 0000000000..d221f29a86
--- /dev/null
+++ b/lib/ffmpeg/patches/0040-ffmpeg-backport-arm-Add-VFP-accelerated-version-of-s.patch
@@ -0,0 +1,311 @@
+From 40daea3c1bafa9cea37b65f856c3c0432767d760 Mon Sep 17 00:00:00 2001
+From: Ben Avison <bavison@riscosopen.org>
+Date: Mon, 15 Jul 2013 18:28:09 +0100
+Subject: [PATCH 39/49] [ffmpeg] - backport - arm: Add VFP-accelerated version
+ of synth_filter_float
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+                     Before          After
+                     Mean    StdDev  Mean    StdDev  Change
+This function        9295.0  114.9   4853.2  83.5    +91.5%
+Overall              23699.8 397.6   19285.5 292.0   +22.9%
+
+Signed-off-by: Martin Storsjö <martin@martin.st>
+---
+ lib/ffmpeg/libavcodec/arm/Makefile           |    1 +
+ lib/ffmpeg/libavcodec/arm/fft_init_arm.c     |    8 +
+ lib/ffmpeg/libavcodec/arm/synth_filter_vfp.S |  243 ++++++++++++++++++++++++++
+ 3 files changed, 252 insertions(+)
+ create mode 100644 lib/ffmpeg/libavcodec/arm/synth_filter_vfp.S
+
+diff --git a/lib/ffmpeg/libavcodec/arm/Makefile b/lib/ffmpeg/libavcodec/arm/Makefile
+index 1c91d62..aee9d73 100644
+--- a/lib/ffmpeg/libavcodec/arm/Makefile
++++ b/lib/ffmpeg/libavcodec/arm/Makefile
+@@ -58,6 +58,7 @@ ARMV6-OBJS += arm/dsputil_init_armv6.o \
+                                           arm/dsputil_armv6.o \
+                                           arm/simple_idct_armv6.o \
+ 
++VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o
+ VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o
+ 
+ NEON-OBJS-$(CONFIG_FFT) += arm/fft_neon.o \
+diff --git a/lib/ffmpeg/libavcodec/arm/fft_init_arm.c b/lib/ffmpeg/libavcodec/arm/fft_init_arm.c
+index 8c98abc..fe0acc5 100644
+--- a/lib/ffmpeg/libavcodec/arm/fft_init_arm.c
++++ b/lib/ffmpeg/libavcodec/arm/fft_init_arm.c
+@@ -32,6 +32,12 @@ void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input)
+ 
+ void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);
+ 
++void ff_synth_filter_float_vfp(FFTContext *imdct,
++                               float *synth_buf_ptr, int *synth_buf_offset,
++                               float synth_buf2[32], const float window[512],
++                               float out[32], const float in[32],
++                               float scale);
++
+ void ff_synth_filter_float_neon(FFTContext *imdct,
+                                 float *synth_buf_ptr, int *synth_buf_offset,
+                                 float synth_buf2[32], const float window[512],
+@@ -71,6 +77,8 @@ av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
+ {
+     int cpu_flags = av_get_cpu_flags();
+ 
++    if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
++        s->synth_filter_float = ff_synth_filter_float_vfp; /* replaced below when NEON is present */
+     if (have_neon(cpu_flags))
+         s->synth_filter_float = ff_synth_filter_float_neon;
+ }
+diff --git a/lib/ffmpeg/libavcodec/arm/synth_filter_vfp.S b/lib/ffmpeg/libavcodec/arm/synth_filter_vfp.S
+new file mode 100644
+index 0000000..c219c41
+--- /dev/null
++++ b/lib/ffmpeg/libavcodec/arm/synth_filter_vfp.S
+@@ -0,0 +1,243 @@
++/*
++ * Copyright (c) 2013 RISC OS Open Ltd
++ * Author: Ben Avison <bavison@riscosopen.org>
++ *
++ * This file is part of Libav.
++ *
++ * Libav is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * Libav is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with Libav; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/arm/asm.S"
++
++IMDCT           .req    r0                          @ argument 1: FFTContext *imdct, left in r0 for ff_imdct_half_vfp
++ORIG_P_SB       .req    r1                          @ argument 2: synth_buf_ptr
++P_SB_OFF        .req    r2                          @ argument 3: synth_buf_offset
++I               .req    r0                          @ r0-r2 get new roles after the IMDCT call: outer loop counter
++P_SB2_UP        .req    r1                          @ ascending pointer into synth_buf2
++OLDFPSCR        .req    r2                          @ FPSCR value on entry, written back before returning
++P_SB2_DN        .req    r3                          @ argument 4 (synth_buf2) on entry, then descending pointer into it
++P_WIN_DN        .req    r4                          @ window pointer that steps down 4 floats per outer pass
++P_OUT_DN        .req    r5                          @ descending pointer into out
++P_SB            .req    r6                          @ current read position in the circular synth_buf
++J_WRAP          .req    r7                          @ buffer offset with the low 6 bits cleared, compared against J
++P_WIN_UP        .req    r12                         @ window pointer that steps up 4 floats per outer pass
++P_OUT_UP        .req    r14                         @ ascending pointer into out
++
++SCALE           .req    s0                          @ scale argument (already in s0 for hardfp, loaded from the stack otherwise)
++SBUF_DAT_REV0   .req    s4                          @ s4-s7: copy of a synth_buf quad in reversed element order
++SBUF_DAT_REV1   .req    s5
++SBUF_DAT_REV2   .req    s6
++SBUF_DAT_REV3   .req    s7
++VA0             .req    s8                          @ s8-s11: accumulator A, stored to out via P_OUT_DN
++VA3             .req    s11
++VB0             .req    s12                         @ s12-s15: accumulator B, stored to out via P_OUT_UP
++VB3             .req    s15
++VC0             .req    s8                          @ second loop reuses s8-s11 as accumulator C
++VC3             .req    s11
++VD0             .req    s12                         @ second loop reuses s12-s15 as accumulator D
++VD3             .req    s15
++SBUF_DAT0       .req    s16                         @ s16-s19 (d8-d9): synth_buf quad loaded by even numbered calls
++SBUF_DAT1       .req    s17
++SBUF_DAT2       .req    s18
++SBUF_DAT3       .req    s19
++SBUF_DAT_ALT0   .req    s20                         @ s20-s23 (d10-d11): synth_buf quad loaded by odd numbered calls
++SBUF_DAT_ALT1   .req    s21
++SBUF_DAT_ALT2   .req    s22
++SBUF_DAT_ALT3   .req    s23
++WIN_DN_DAT0     .req    s24                         @ s24-s27 (d12-d13): window quad read through P_WIN_DN
++WIN_UP_DAT0     .req    s28                         @ s28-s31 (d14-d15): window quad read through P_WIN_UP
++
++@ inner_loop is one step of the accumulation, split in two stages selected by non-empty tail and head arguments.
++@ head loads new data and accumulates into B (ab) or C (cd), tail accumulates the previously reversed quad into A (ab) or D (cd).
++.macro inner_loop  half, tail, head
++ .if (OFFSET & (64*4)) == 0                @ even numbered call
++        SBUF_DAT_THIS0 .req SBUF_DAT0
++        SBUF_DAT_THIS1 .req SBUF_DAT1
++        SBUF_DAT_THIS2 .req SBUF_DAT2
++        SBUF_DAT_THIS3 .req SBUF_DAT3
++  .ifnc "\head",""
++        vldr    d8, [P_SB, #OFFSET]         @ d8 = SBUF_DAT
++        vldr    d9, [P_SB, #OFFSET+8]       @ d9 = upper two floats of the same quad
++  .endif
++ .else
++        SBUF_DAT_THIS0 .req SBUF_DAT_ALT0
++        SBUF_DAT_THIS1 .req SBUF_DAT_ALT1
++        SBUF_DAT_THIS2 .req SBUF_DAT_ALT2
++        SBUF_DAT_THIS3 .req SBUF_DAT_ALT3
++  .ifnc "\head",""
++        vldr    d10, [P_SB, #OFFSET]        @ d10 = SBUF_DAT_ALT
++        vldr    d11, [P_SB, #OFFSET+8]      @ d11 = upper two floats of the same quad
++  .endif
++ .endif
++ .ifnc "\tail",""
++  .ifc "\half","ab"
++        vmls.f  VA0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors
++  .else
++        vmla.f  VD0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors
++  .endif
++ .endif
++ .ifnc "\head",""
++        vldr    d14, [P_WIN_UP, #OFFSET]    @ d14 = WIN_UP_DAT
++        vldr    d15, [P_WIN_UP, #OFFSET+8]  @ d15 = upper two floats of the same quad
++        vldr    d12, [P_WIN_DN, #OFFSET]    @ d12 = WIN_DN_DAT
++        vldr    d13, [P_WIN_DN, #OFFSET+8]  @ d13 = upper two floats of the same quad
++        vmov    SBUF_DAT_REV3, SBUF_DAT_THIS0 @ these four moves copy the quad into s4-s7 in reverse order
++        vmov    SBUF_DAT_REV2, SBUF_DAT_THIS1
++        vmov    SBUF_DAT_REV1, SBUF_DAT_THIS2
++        vmov    SBUF_DAT_REV0, SBUF_DAT_THIS3 @ s4-s7 are consumed by the tail stage of the next call
++  .ifc "\half","ab"
++        vmla.f  VB0, SBUF_DAT_THIS0, WIN_UP_DAT0 @ B += synth_buf quad * window quad
++  .else
++        vmla.f  VC0, SBUF_DAT_THIS0, WIN_UP_DAT0 @ C += synth_buf quad * window quad
++  .endif
++        teq     J_WRAP, #J                  @ has this step reached the wrap position of the circular buffer
++        bne     2f              @ strongly predictable, so better than cond exec in this case
++        sub     P_SB, P_SB, #512*4          @ yes: step back by the whole buffer (512 floats)
++2:
++  .set J, J - 64                            @ assembly-time wrap position for the next call
++  .set OFFSET, OFFSET + 64*4                @ assembly-time load offset: next call reads 64 floats further on
++ .endif
++        .unreq  SBUF_DAT_THIS0
++        .unreq  SBUF_DAT_THIS1
++        .unreq  SBUF_DAT_THIS2
++        .unreq  SBUF_DAT_THIS3
++.endm
++
++@ Runs ff_imdct_half_vfp on in[] into the circular synth_buf, then accumulates synth_buf * window into out[] and synth_buf2[].
++/* void ff_synth_filter_float_vfp(FFTContext *imdct,
++ *                                float *synth_buf_ptr, int *synth_buf_offset,
++ *                                float synth_buf2[32], const float window[512],
++ *                                float out[32], const float in[32], float scale)
++ */
++function ff_synth_filter_float_vfp, export=1
++        push    {r3-r7,lr}                          @ 6 words, r3 (synth_buf2) is included so it can be reloaded after the call
++        vpush   {s16-s31}                           @ 16 words, these registers are restored by the vpop at the end
++        ldr     lr, [P_SB_OFF]                      @ lr = *synth_buf_offset
++        add     a2, ORIG_P_SB, lr, LSL #2           @ calculate synth_buf to pass to imdct_half
++        mov     P_SB, a2                            @ and keep a copy for ourselves
++        bic     J_WRAP, lr, #63                     @ mangled to make testing for wrap easier in inner loop
++        sub     lr, lr, #32                         @ step the offset back by 32 floats
++        and     lr, lr, #512-32                     @ keep it a multiple of 32 within 0..480
++        str     lr, [P_SB_OFF]                      @ rotate offset, modulo buffer size, ready for next call
++        ldr     a3, [sp, #(16+6+2)*4]               @ fetch in from stack, to pass to imdct_half
++VFP     vmov    s16, SCALE                          @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case
++        bl      ff_imdct_half_vfp                   @ called with r0 = imdct (untouched), a2 = synth_buf, a3 = in
++VFP     vmov    SCALE, s16                          @ bring scale back into s0 (hardfp case)
++
++        fmrx    OLDFPSCR, FPSCR                     @ keep the entry FPSCR in r2 for the restore at the end
++        ldr     lr, =0x03030000                     @ RunFast mode, short vectors of length 4, stride 1
++        fmxr    FPSCR, lr                           @ in effect until the matching fmxr before return
++        ldr     P_SB2_DN, [sp, #16*4]               @ synth_buf2: the pushed r3, just above the 16 saved s registers
++        ldr     P_WIN_DN, [sp, #(16+6+0)*4]         @ window: first stack argument, above 16 s registers and 6 pushed words
++        ldr     P_OUT_DN, [sp, #(16+6+1)*4]         @ out: second stack argument
++NOVFP   vldr    SCALE, [sp, #(16+6+3)*4]            @ scale: fourth stack argument when it was not passed in s0
++
++#define IMM_OFF_SKEW 956                            /* also valid immediate constant when you add 16*4 */
++        add     P_SB, P_SB, #IMM_OFF_SKEW           @ so we can use -ve offsets to use full immediate offset range
++        add     P_SB2_UP, P_SB2_DN, #16*4           @ &synth_buf2[16], read upwards by vldmia
++        add     P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW @ &window[16], skewed the same way as P_SB
++        add     P_OUT_UP, P_OUT_DN, #16*4           @ &out[16], written upwards by vstmia
++        add     P_SB2_DN, P_SB2_DN, #16*4           @ &synth_buf2[16], read downwards by vldmdb
++        add     P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW @ &window[12], skewed the same way as P_SB
++        add     P_OUT_DN, P_OUT_DN, #16*4           @ &out[16], written downwards by vstmdb
++        mov     I, #4                               @ 4 passes, each producing 4 floats per direction
++1:
++        vldmia  P_SB2_UP!, {VB0-VB3}                @ B starts from the next 4 floats of synth_buf2, ascending
++        vldmdb  P_SB2_DN!, {VA0-VA3}                @ A starts from the preceding 4 floats of synth_buf2, descending
++ .set J, 512 - 64                                   @ first wrap position tested, lowered by 64 in each head stage
++ .set OFFSET, -IMM_OFF_SKEW                         @ first load offset, cancels the skew added to the pointers
++        inner_loop  ab,, head
++ .rept 7
++        inner_loop  ab, tail, head
++ .endr
++        inner_loop  ab, tail
++        add     P_WIN_UP, P_WIN_UP, #4*4            @ next window quad, ascending
++        sub     P_WIN_DN, P_WIN_DN, #4*4            @ next window quad, descending
++        vmul.f  VB0, VB0, SCALE                     @ SCALE treated as scalar
++        add     P_SB, P_SB, #(512+4)*4              @ NOTE(review): assumes exactly one wrap per pass, undoes it and advances 4 floats
++        subs    I, I, #1                            @ sets the flags tested by bne below
++        vmul.f  VA0, VA0, SCALE                     @ as for VB0 above
++        vstmia  P_OUT_UP!, {VB0-VB3}                @ 4 floats of out, ascending from out[16]
++        vstmdb  P_OUT_DN!, {VA0-VA3}                @ 4 floats of out, descending from out[15]
++        bne     1b                                  @ repeat until I reaches zero
++
++        add     P_SB2_DN, P_SB2_DN, #(16+28-12)*4   @ the four vldmdb left it at synth_buf2, now &synth_buf2[32]
++        sub     P_SB2_UP, P_SB2_UP, #(16+16)*4      @ the four vldmia left it at &synth_buf2[32], now synth_buf2
++        add     P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4 @ 64 floats on from where the first loop left it
++        mov     I, #4                               @ again 4 passes
++1:
++        vldr.d  d4, zero                            @ d4 = VC0
++        vldr.d  d5, zero                            @ d5 = upper half of C, so C = 0
++        vldr.d  d6, zero                            @ d6 = VD0
++        vldr.d  d7, zero                            @ d7 = upper half of D, so D = 0
++ .set J, 512 - 64                                   @ restart the assembly-time wrap position
++ .set OFFSET, -IMM_OFF_SKEW                         @ restart the assembly-time load offset
++        inner_loop  cd,, head
++ .rept 7
++        inner_loop  cd, tail, head
++ .endr
++        inner_loop  cd, tail
++        add     P_WIN_UP, P_WIN_UP, #4*4            @ next window quad, ascending
++        sub     P_WIN_DN, P_WIN_DN, #4*4            @ next window quad, descending
++        add     P_SB, P_SB, #(512+4)*4              @ same adjustment as in the first loop
++        subs    I, I, #1                            @ sets the flags tested by bne below
++        vstmia  P_SB2_UP!, {VC0-VC3}                @ C goes to synth_buf2, ascending from synth_buf2[0]
++        vstmdb  P_SB2_DN!, {VD0-VD3}                @ D goes to synth_buf2, descending from synth_buf2[31]
++        bne     1b                                  @ repeat until I reaches zero
++
++        fmxr    FPSCR, OLDFPSCR                     @ put back the FPSCR value saved after the IMDCT call
++        vpop    {s16-s31}                           @ undo the vpush at entry
++        pop     {r3-r7,pc}                          @ undo the push at entry and return
++endfunc
++
++        .unreq  IMDCT
++        .unreq  ORIG_P_SB
++        .unreq  P_SB_OFF
++        .unreq  I
++        .unreq  P_SB2_UP
++        .unreq  OLDFPSCR
++        .unreq  P_SB2_DN
++        .unreq  P_WIN_DN
++        .unreq  P_OUT_DN
++        .unreq  P_SB
++        .unreq  J_WRAP
++        .unreq  P_WIN_UP
++        .unreq  P_OUT_UP
++
++        .unreq  SCALE
++        .unreq  SBUF_DAT_REV0
++        .unreq  SBUF_DAT_REV1
++        .unreq  SBUF_DAT_REV2
++        .unreq  SBUF_DAT_REV3
++        .unreq  VA0
++        .unreq  VA3
++        .unreq  VB0
++        .unreq  VB3
++        .unreq  VC0
++        .unreq  VC3
++        .unreq  VD0
++        .unreq  VD3
++        .unreq  SBUF_DAT0
++        .unreq  SBUF_DAT1
++        .unreq  SBUF_DAT2
++        .unreq  SBUF_DAT3
++        .unreq  SBUF_DAT_ALT0
++        .unreq  SBUF_DAT_ALT1
++        .unreq  SBUF_DAT_ALT2
++        .unreq  SBUF_DAT_ALT3
++        .unreq  WIN_DN_DAT0
++        .unreq  WIN_UP_DAT0
++
++        .align  3                                   @ 8-byte alignment for the doubleword below
++zero:   .word   0, 0                                @ 8 zero bytes, the source of the vldr.d loads that clear C and D
+--
+1.7.9.5 |