author     Ben Avison <bavison@riscosopen.org>       2013-07-15 18:28:17 +0100
committer  popcornmix <popcornmix@gmail.com>         2013-08-02 14:22:45 +0100
commit     e8bfcf140871363d7f4624a67c7db706e98c588c (patch)
tree       fe06488b8aa3d8500ec88885567961e91d28f502 /lib/ffmpeg/libavcodec
parent     658e2d7939f89bdb7e178e013d3ebf1e75714230 (diff)
[ffmpeg] - backport - arm: Add VFP-accelerated version of qmf_32_subbands
                 Before            After
               Mean   StdDev    Mean     StdDev  Change
This function  1323.0   98.0     746.2    60.6   +77.3%
Overall       15400.0  336.4   14147.5   288.4    +8.9%
Signed-off-by: Martin Storsjö <martin@martin.st>
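
For context, the scalar path this assembly replaces is essentially a transpose with a sign flip, followed by one pass of a 32-band synthesis filter per time slot. Below is a minimal C sketch of that shape, not the actual FFmpeg C code: the names are hypothetical and the SynthFilterContext call is reduced to a plain callback, but the sign pattern matches the vneg.f instructions in the new dcadsp_vfp.S.

    #include <string.h>

    /* Hypothetical stand-in for SynthFilterContext's synth_filter_float;
     * the real prototype carries FFT and history-buffer state as well. */
    typedef void (*synth_fn)(float *samples_out, const float subbands[32],
                             float scale);

    static void qmf_32_subbands_ref(float samples_in[32][8], int sb_act,
                                    synth_fn synth, float *samples_out,
                                    float scale)
    {
        float raXin[32];

        for (int slot = 0; slot < 8; slot++) {
            /* Gather one sample per subband (a column of the 32x8 matrix),
             * negating subbands 0 and 3 mod 4 -- the (i - 1) & 2 test below
             * is the pattern the assembly implements with vneg.f. */
            for (int i = 0; i < sb_act; i++) {
                float v = samples_in[i][slot];
                raXin[i] = ((i - 1) & 2) ? -v : v;
            }
            /* Inactive subbands are zeroed, as at label 4: in the assembly. */
            memset(&raXin[sb_act], 0, (32 - sb_act) * sizeof(*raXin));

            synth(samples_out, raXin, scale);  /* 32-band synthesis per slot */
            samples_out += 32;
        }
    }

The assembly version does the whole 32x8 transpose up front into a stack buffer (so the loads can be paired and the negations vectorised), then makes the eight synthesis-filter calls in a row.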
Diffstat (limited to 'lib/ffmpeg/libavcodec')
-rw-r--r--  lib/ffmpeg/libavcodec/arm/dcadsp_init_arm.c |  10
-rw-r--r--  lib/ffmpeg/libavcodec/arm/dcadsp_vfp.S      | 493
2 files changed, 502 insertions, 1 deletion
diff --git a/lib/ffmpeg/libavcodec/arm/dcadsp_init_arm.c b/lib/ffmpeg/libavcodec/arm/dcadsp_init_arm.c
index a1efbfff7f..58267a2fe6 100644
--- a/lib/ffmpeg/libavcodec/arm/dcadsp_init_arm.c
+++ b/lib/ffmpeg/libavcodec/arm/dcadsp_init_arm.c
@@ -26,6 +26,12 @@
 void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
                         int decifactor, float scale);
+void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
+                                SynthFilterContext *synth, FFTContext *imdct,
+                                float synth_buf_ptr[512],
+                                int *synth_buf_offset, float synth_buf2[32],
+                                const float window[512], float *samples_out,
+                                float raXin[32], float scale);
 void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
                          int decifactor, float scale);
 
@@ -33,8 +39,10 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
+    if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
         s->lfe_fir = ff_dca_lfe_fir_vfp;
+        s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
+    }
     if (have_neon(cpu_flags))
         s->lfe_fir = ff_dca_lfe_fir_neon;
 }
diff --git a/lib/ffmpeg/libavcodec/arm/dcadsp_vfp.S b/lib/ffmpeg/libavcodec/arm/dcadsp_vfp.S
new file mode 100644
index 0000000000..6039e87dfc
--- /dev/null
+++ b/lib/ffmpeg/libavcodec/arm/dcadsp_vfp.S
@@ -0,0 +1,493 @@
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+POUT          .req    a1
+PIN           .req    a2
+PCOEF         .req    a3
+DECIFACTOR    .req    a4
+OLDFPSCR      .req    a4
+COUNTER       .req    ip
+
+SCALE32       .req    s28  @ use vector of 4 in place of 9th scalar when decifactor=32 / JMAX=8
+SCALE64       .req    s0   @ spare register in scalar bank when decifactor=64 / JMAX=4
+IN0           .req    s4
+IN1           .req    s5
+IN2           .req    s6
+IN3           .req    s7
+IN4           .req    s0
+IN5           .req    s1
+IN6           .req    s2
+IN7           .req    s3
+COEF0         .req    s8   @ coefficient elements
+COEF1         .req    s9
+COEF2         .req    s10
+COEF3         .req    s11
+COEF4         .req    s12
+COEF5         .req    s13
+COEF6         .req    s14
+COEF7         .req    s15
+ACCUM0        .req    s16  @ double-buffered multiply-accumulate results
+ACCUM4        .req    s20
+POST0         .req    s24  @ do long-latency post-multiply in this vector in parallel
+POST1         .req    s25
+POST2         .req    s26
+POST3         .req    s27
+
+
+.macro inner_loop  decifactor, dir, tail, head
+ .ifc "\dir","up"
+        .set X, 0
+        .set Y, 4
+ .else
+        .set X, 4*JMAX*4 - 4
+        .set Y, -4
+ .endif
+ .ifnc "\head",""
+        vldr    COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
+        vldr    COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
+        vldr    COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
+        vldr    COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
+ .endif
+ .ifnc "\tail",""
+        vadd.f  POST0, ACCUM0, ACCUM4           @ vector operation
+ .endif
+ .ifnc "\head",""
+        vmul.f  ACCUM0, COEF0, IN0              @ vector = vector * scalar
+        vldr    COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
+        vldr    COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
+        vldr    COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
+ .endif
+ .ifnc "\tail",""
+        vmul.f  POST0, POST0, SCALE\decifactor  @ vector operation (SCALE may be scalar)
+ .endif
+ .ifnc "\head",""
+        vldr    COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
+  .ifc "\tail",""
+        vmul.f  ACCUM4, COEF4, IN1              @ vector operation
+  .endif
+        vldr    COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
+        vldr    COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
+  .ifnc "\tail",""
+        vmul.f  ACCUM4, COEF4, IN1              @ vector operation
+  .endif
+        vldr    COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
+        vldr    COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
+ .endif
+ .ifnc "\tail",""
+        vstmia  POUT!, {POST0-POST3}
+ .endif
+ .ifnc "\head",""
+        vmla.f  ACCUM0, COEF0, IN2              @ vector = vector * scalar
+        vldr    COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
+        vldr    COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
+        vldr    COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
+        vldr    COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
+        vmla.f  ACCUM4, COEF4, IN3              @ vector = vector * scalar
+  .if \decifactor == 32
+        vldr    COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
+        vldr    COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
+        vldr    COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
+        vldr    COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
+        vmla.f  ACCUM0, COEF0, IN4              @ vector = vector * scalar
+        vldr    COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
+        vldr    COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
+        vldr    COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
+        vldr    COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
+        vmla.f  ACCUM4, COEF4, IN5              @ vector = vector * scalar
+        vldr    COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
+        vldr    COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
+        vldr    COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
+        vldr    COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
+        vmla.f  ACCUM0, COEF0, IN6              @ vector = vector * scalar
+        vldr    COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
+        vldr    COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
+        vldr    COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
+        vldr    COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
+        vmla.f  ACCUM4, COEF4, IN7              @ vector = vector * scalar
+  .endif
+ .endif
+.endm
+
+.macro dca_lfe_fir  decifactor
+ .if \decifactor == 32
+        .set JMAX, 8
+        vpush   {s16-s31}
+        vmov    SCALE32, s0                     @ duplicate scalar across vector
+        vldr    IN4, [PIN, #-4*4]
+        vldr    IN5, [PIN, #-5*4]
+        vldr    IN6, [PIN, #-6*4]
+        vldr    IN7, [PIN, #-7*4]
+ .else
+        .set JMAX, 4
+        vpush   {s16-s27}
+ .endif
+
+        mov     COUNTER, #\decifactor/4 - 1
+        inner_loop  \decifactor, up,, head
+1:      add     PCOEF, PCOEF, #4*JMAX*4
+        subs    COUNTER, COUNTER, #1
+        inner_loop  \decifactor, up, tail, head
+        bne     1b
+        inner_loop  \decifactor, up, tail
+
+        mov     COUNTER, #\decifactor/4 - 1
+        inner_loop  \decifactor, down,, head
+1:      sub     PCOEF, PCOEF, #4*JMAX*4
+        subs    COUNTER, COUNTER, #1
+        inner_loop  \decifactor, down, tail, head
+        bne     1b
+        inner_loop  \decifactor, down, tail
+
+ .if \decifactor == 32
+        vpop    {s16-s31}
+ .else
+        vpop    {s16-s27}
+ .endif
+        fmxr    FPSCR, OLDFPSCR
+        bx      lr
+.endm
+
+
+/* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
+ *                         int decifactor, float scale)
+ */
+function ff_dca_lfe_fir_vfp, export=1
+        teq     DECIFACTOR, #32
+        fmrx    OLDFPSCR, FPSCR
+        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
+        fmxr    FPSCR, ip
+NOVFP   vldr    s0, [sp]
+        vldr    IN0, [PIN, #-0*4]
+        vldr    IN1, [PIN, #-1*4]
+        vldr    IN2, [PIN, #-2*4]
+        vldr    IN3, [PIN, #-3*4]
+        beq     32f
+64:     dca_lfe_fir  64
+ .ltorg
+32:     dca_lfe_fir  32
+endfunc
+
+        .unreq  POUT
+        .unreq  PIN
+        .unreq  PCOEF
+        .unreq  DECIFACTOR
+        .unreq  OLDFPSCR
+        .unreq  COUNTER
+
+        .unreq  SCALE32
+        .unreq  SCALE64
+        .unreq  IN0
+        .unreq  IN1
+        .unreq  IN2
+        .unreq  IN3
+        .unreq  IN4
+        .unreq  IN5
+        .unreq  IN6
+        .unreq  IN7
+        .unreq  COEF0
+        .unreq  COEF1
+        .unreq  COEF2
+        .unreq  COEF3
+        .unreq  COEF4
+        .unreq  COEF5
+        .unreq  COEF6
+        .unreq  COEF7
+        .unreq  ACCUM0
+        .unreq  ACCUM4
+        .unreq  POST0
+        .unreq  POST1
+        .unreq  POST2
+        .unreq  POST3
+
+
+IN       .req    a1
+SBACT    .req    a2
+OLDFPSCR .req    a3
+IMDCT    .req    a4
+WINDOW   .req    v1
+OUT      .req    v2
+BUF      .req    v3
+SCALEINT .req    v4  @ only used in softfp case
+COUNT    .req    v5
+
+SCALE    .req    s0
+
+/* Stack layout differs in softfp and hardfp cases:
+ *
+ * hardfp
+ *      fp -> 6 arg words saved by caller
+ *            a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
+ *            s16-s23 on entry
+ *            align 16
+ *     buf -> 8*32*4 bytes buffer
+ *            s0 on entry
+ *      sp -> 3 arg words for callee
+ *
+ * softfp
+ *      fp -> 7 arg words saved by caller
+ *            a4,v1-v5,fp,lr on entry
+ *            s16-s23 on entry
+ *            align 16
+ *     buf -> 8*32*4 bytes buffer
+ *      sp -> 4 arg words for callee
+ */
+
+/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
+ *                                 SynthFilterContext *synth, FFTContext *imdct,
+ *                                 float (*synth_buf_ptr)[512],
+ *                                 int *synth_buf_offset, float (*synth_buf2)[32],
+ *                                 const float (*window)[512], float *samples_out,
+ *                                 float (*raXin)[32], float scale);
+ */
+function ff_dca_qmf_32_subbands_vfp, export=1
+VFP     push    {a3-a4,v1-v3,v5,fp,lr}
+NOVFP   push    {a4,v1-v5,fp,lr}
+        add     fp, sp, #8*4
+        vpush   {s16-s23}
+        @ The buffer pointed at by raXin isn't big enough for us to do a
+        @ complete matrix transposition as we want to, so allocate an
+        @ alternative buffer from the stack. Align to 4 words for speed.
+        sub     BUF, sp, #8*32*4
+        bic     BUF, BUF, #15
+        mov     sp, BUF
+        ldr     lr, =0x03330000         @ RunFast mode, short vectors of length 4, stride 2
+        fmrx    OLDFPSCR, FPSCR
+        fmxr    FPSCR, lr
+        @ COUNT is used to count down 2 things at once:
+        @ bits 0-4 are the number of word pairs remaining in the output row
+        @ bits 5-31 are the number of words to copy (with possible negation)
+        @ from the source matrix before we start zeroing the remainder
+        mov     COUNT, #(-4 << 5) + 16
+        adds    COUNT, COUNT, SBACT, lsl #5
+        bmi     2f
+1:
+        vldr    s8, [IN, #(0*8+0)*4]
+        vldr    s10, [IN, #(0*8+1)*4]
+        vldr    s12, [IN, #(0*8+2)*4]
+        vldr    s14, [IN, #(0*8+3)*4]
+        vldr    s16, [IN, #(0*8+4)*4]
+        vldr    s18, [IN, #(0*8+5)*4]
+        vldr    s20, [IN, #(0*8+6)*4]
+        vldr    s22, [IN, #(0*8+7)*4]
+        vneg.f  s8, s8
+        vldr    s9, [IN, #(1*8+0)*4]
+        vldr    s11, [IN, #(1*8+1)*4]
+        vldr    s13, [IN, #(1*8+2)*4]
+        vldr    s15, [IN, #(1*8+3)*4]
+        vneg.f  s16, s16
+        vldr    s17, [IN, #(1*8+4)*4]
+        vldr    s19, [IN, #(1*8+5)*4]
+        vldr    s21, [IN, #(1*8+6)*4]
+        vldr    s23, [IN, #(1*8+7)*4]
+        vstr    d4, [BUF, #(0*32+0)*4]
+        vstr    d5, [BUF, #(1*32+0)*4]
+        vstr    d6, [BUF, #(2*32+0)*4]
+        vstr    d7, [BUF, #(3*32+0)*4]
+        vstr    d8, [BUF, #(4*32+0)*4]
+        vstr    d9, [BUF, #(5*32+0)*4]
+        vstr    d10, [BUF, #(6*32+0)*4]
+        vstr    d11, [BUF, #(7*32+0)*4]
+        vldr    s9, [IN, #(3*8+0)*4]
+        vldr    s11, [IN, #(3*8+1)*4]
+        vldr    s13, [IN, #(3*8+2)*4]
+        vldr    s15, [IN, #(3*8+3)*4]
+        vldr    s17, [IN, #(3*8+4)*4]
+        vldr    s19, [IN, #(3*8+5)*4]
+        vldr    s21, [IN, #(3*8+6)*4]
+        vldr    s23, [IN, #(3*8+7)*4]
+        vneg.f  s9, s9
+        vldr    s8, [IN, #(2*8+0)*4]
+        vldr    s10, [IN, #(2*8+1)*4]
+        vldr    s12, [IN, #(2*8+2)*4]
+        vldr    s14, [IN, #(2*8+3)*4]
+        vneg.f  s17, s17
+        vldr    s16, [IN, #(2*8+4)*4]
+        vldr    s18, [IN, #(2*8+5)*4]
+        vldr    s20, [IN, #(2*8+6)*4]
+        vldr    s22, [IN, #(2*8+7)*4]
+        vstr    d4, [BUF, #(0*32+2)*4]
+        vstr    d5, [BUF, #(1*32+2)*4]
+        vstr    d6, [BUF, #(2*32+2)*4]
+        vstr    d7, [BUF, #(3*32+2)*4]
+        vstr    d8, [BUF, #(4*32+2)*4]
+        vstr    d9, [BUF, #(5*32+2)*4]
+        vstr    d10, [BUF, #(6*32+2)*4]
+        vstr    d11, [BUF, #(7*32+2)*4]
+        add     IN, IN, #4*8*4
+        add     BUF, BUF, #4*4
+        subs    COUNT, COUNT, #(4 << 5) + 2
+        bpl     1b
+2:      @ Now deal with trailing < 4 samples
+        adds    COUNT, COUNT, #3 << 5
+        bmi     4f      @ sb_act was a multiple of 4
+        bics    lr, COUNT, #0x1F
+        bne     3f
+        @ sb_act was n*4+1
+        vldr    s8, [IN, #(0*8+0)*4]
+        vldr    s10, [IN, #(0*8+1)*4]
+        vldr    s12, [IN, #(0*8+2)*4]
+        vldr    s14, [IN, #(0*8+3)*4]
+        vldr    s16, [IN, #(0*8+4)*4]
+        vldr    s18, [IN, #(0*8+5)*4]
+        vldr    s20, [IN, #(0*8+6)*4]
+        vldr    s22, [IN, #(0*8+7)*4]
+        vneg.f  s8, s8
+        vldr    s9, zero
+        vldr    s11, zero
+        vldr    s13, zero
+        vldr    s15, zero
+        vneg.f  s16, s16
+        vldr    s17, zero
+        vldr    s19, zero
+        vldr    s21, zero
+        vldr    s23, zero
+        vstr    d4, [BUF, #(0*32+0)*4]
+        vstr    d5, [BUF, #(1*32+0)*4]
+        vstr    d6, [BUF, #(2*32+0)*4]
+        vstr    d7, [BUF, #(3*32+0)*4]
+        vstr    d8, [BUF, #(4*32+0)*4]
+        vstr    d9, [BUF, #(5*32+0)*4]
+        vstr    d10, [BUF, #(6*32+0)*4]
+        vstr    d11, [BUF, #(7*32+0)*4]
+        add     BUF, BUF, #2*4
+        sub     COUNT, COUNT, #1
+        b       4f
+3:      @ sb_act was n*4+2 or n*4+3, so do the first 2
+        vldr    s8, [IN, #(0*8+0)*4]
+        vldr    s10, [IN, #(0*8+1)*4]
+        vldr    s12, [IN, #(0*8+2)*4]
+        vldr    s14, [IN, #(0*8+3)*4]
+        vldr    s16, [IN, #(0*8+4)*4]
+        vldr    s18, [IN, #(0*8+5)*4]
+        vldr    s20, [IN, #(0*8+6)*4]
+        vldr    s22, [IN, #(0*8+7)*4]
+        vneg.f  s8, s8
+        vldr    s9, [IN, #(1*8+0)*4]
+        vldr    s11, [IN, #(1*8+1)*4]
+        vldr    s13, [IN, #(1*8+2)*4]
+        vldr    s15, [IN, #(1*8+3)*4]
+        vneg.f  s16, s16
+        vldr    s17, [IN, #(1*8+4)*4]
+        vldr    s19, [IN, #(1*8+5)*4]
+        vldr    s21, [IN, #(1*8+6)*4]
+        vldr    s23, [IN, #(1*8+7)*4]
+        vstr    d4, [BUF, #(0*32+0)*4]
+        vstr    d5, [BUF, #(1*32+0)*4]
+        vstr    d6, [BUF, #(2*32+0)*4]
+        vstr    d7, [BUF, #(3*32+0)*4]
+        vstr    d8, [BUF, #(4*32+0)*4]
+        vstr    d9, [BUF, #(5*32+0)*4]
+        vstr    d10, [BUF, #(6*32+0)*4]
+        vstr    d11, [BUF, #(7*32+0)*4]
+        add     BUF, BUF, #2*4
+        sub     COUNT, COUNT, #(2 << 5) + 1
+        bics    lr, COUNT, #0x1F
+        bne     4f
+        @ sb_act was n*4+3
+        vldr    s8, [IN, #(2*8+0)*4]
+        vldr    s10, [IN, #(2*8+1)*4]
+        vldr    s12, [IN, #(2*8+2)*4]
+        vldr    s14, [IN, #(2*8+3)*4]
+        vldr    s16, [IN, #(2*8+4)*4]
+        vldr    s18, [IN, #(2*8+5)*4]
+        vldr    s20, [IN, #(2*8+6)*4]
+        vldr    s22, [IN, #(2*8+7)*4]
+        vldr    s9, zero
+        vldr    s11, zero
+        vldr    s13, zero
+        vldr    s15, zero
+        vldr    s17, zero
+        vldr    s19, zero
+        vldr    s21, zero
+        vldr    s23, zero
+        vstr    d4, [BUF, #(0*32+0)*4]
+        vstr    d5, [BUF, #(1*32+0)*4]
+        vstr    d6, [BUF, #(2*32+0)*4]
+        vstr    d7, [BUF, #(3*32+0)*4]
+        vstr    d8, [BUF, #(4*32+0)*4]
+        vstr    d9, [BUF, #(5*32+0)*4]
+        vstr    d10, [BUF, #(6*32+0)*4]
+        vstr    d11, [BUF, #(7*32+0)*4]
+        add     BUF, BUF, #2*4
+        sub     COUNT, COUNT, #1
+4:      @ Now fill the remainder with 0
+        vldr    s8, zero
+        vldr    s9, zero
+        ands    COUNT, COUNT, #0x1F
+        beq     6f
+5:      vstr    d4, [BUF, #(0*32+0)*4]
+        vstr    d4, [BUF, #(1*32+0)*4]
+        vstr    d4, [BUF, #(2*32+0)*4]
+        vstr    d4, [BUF, #(3*32+0)*4]
+        vstr    d4, [BUF, #(4*32+0)*4]
+        vstr    d4, [BUF, #(5*32+0)*4]
+        vstr    d4, [BUF, #(6*32+0)*4]
+        vstr    d4, [BUF, #(7*32+0)*4]
+        add     BUF, BUF, #2*4
+        subs    COUNT, COUNT, #1
+        bne     5b
+6:
+        fmxr    FPSCR, OLDFPSCR
+        ldr     WINDOW, [fp, #3*4]
+        ldr     OUT, [fp, #4*4]
+        sub     BUF, BUF, #32*4
+NOVFP   ldr     SCALEINT, [fp, #6*4]
+        mov     COUNT, #8
+VFP     vpush   {SCALE}
+VFP     sub     sp, sp, #3*4
+NOVFP   sub     sp, sp, #4*4
+7:
+VFP     ldr     a1, [fp, #-7*4]         @ imdct
+NOVFP   ldr     a1, [fp, #-8*4]
+        ldmia   fp, {a2-a4}
+VFP     stmia   sp, {WINDOW, OUT, BUF}
+NOVFP   stmia   sp, {WINDOW, OUT, BUF, SCALEINT}
+VFP     vldr    SCALE, [sp, #3*4]
+        bl      ff_synth_filter_float_vfp
+        add     OUT, OUT, #32*4
+        add     BUF, BUF, #32*4
+        subs    COUNT, COUNT, #1
+        bne     7b
+
+A       sub     sp, fp, #(8+8)*4
+T       sub     fp, fp, #(8+8)*4
+T       mov     sp, fp
+        vpop    {s16-s23}
+VFP     pop     {a3-a4,v1-v3,v5,fp,pc}
+NOVFP   pop     {a4,v1-v5,fp,pc}
+endfunc
+
+        .unreq  IN
+        .unreq  SBACT
+        .unreq  OLDFPSCR
+        .unreq  IMDCT
+        .unreq  WINDOW
+        .unreq  OUT
+        .unreq  BUF
+        .unreq  SCALEINT
+        .unreq  COUNT
+
+        .unreq  SCALE
+
+        .align 2
+zero:   .word 0
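
A note on the FPSCR constants: 0x03030000 and 0x03330000 enable the legacy VFP short-vector mode (vectors of length 4, stride 1 or 2) together with the flush-to-zero and default-NaN "RunFast" bits. This is also why dcadsp_init_arm.c only selects these routines on cores with VFP but not VFPv3: short-vector mode is deprecated from VFPv3 onwards and tends to trap or fall back to support code there, so the fast path only pays off on older VFP implementations such as ARM11-class cores. The sketch below decodes those constants; the field names follow the ARM ARM and are not part of this commit.

    #include <stdint.h>
    #include <stdio.h>

    /* FPSCR fields used by the short-vector setup in dcadsp_vfp.S. */
    #define FPSCR_DN        (1u << 25)                   /* default NaN mode   */
    #define FPSCR_FZ        (1u << 24)                   /* flush-to-zero mode */
    #define FPSCR_STRIDE(s) (((s) == 2 ? 3u : 0u) << 20) /* only 1 or 2 encodable */
    #define FPSCR_LEN(n)    (((n) - 1u) << 16)           /* vector length 1..8 */

    int main(void)
    {
        /* ff_dca_lfe_fir_vfp: vectors of 4, stride 1 */
        uint32_t lfe = FPSCR_DN | FPSCR_FZ | FPSCR_STRIDE(1) | FPSCR_LEN(4);
        /* ff_dca_qmf_32_subbands_vfp: vectors of 4, stride 2, so that
         * consecutive loads land in alternating transposed columns */
        uint32_t qmf = FPSCR_DN | FPSCR_FZ | FPSCR_STRIDE(2) | FPSCR_LEN(4);

        /* prints 0x03030000 and 0x03330000, the literals in the file above */
        printf("lfe: 0x%08x  qmf: 0x%08x\n", (unsigned)lfe, (unsigned)qmf);
        return 0;
    }

Both functions save the caller's FPSCR in OLDFPSCR on entry and restore it with fmxr before returning, since short-vector state would otherwise leak into the caller's scalar VFP code.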