author    Ben Avison <bavison@riscosopen.org>       2013-07-15 18:28:17 +0100
committer popcornmix <popcornmix@gmail.com>         2013-08-02 14:22:45 +0100
commit    e8bfcf140871363d7f4624a67c7db706e98c588c (patch)
tree      fe06488b8aa3d8500ec88885567961e91d28f502 /lib/ffmpeg/libavcodec
parent    658e2d7939f89bdb7e178e013d3ebf1e75714230 (diff)
[ffmpeg] - backport - arm: Add VFP-accelerated version of qmf_32_subbands
                     Before            After
                     Mean    StdDev    Mean     StdDev   Change
This function        1323.0    98.0     746.2     60.6   +77.3%
Overall             15400.0   336.4   14147.5    288.4    +8.9%

Signed-off-by: Martin Storsjö <martin@martin.st>
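For context, the scalar path this replaces looks roughly like the
following (an illustrative paraphrase, not the exact upstream dcadsp.c
source; the synth_filter_float member of SynthFilterContext is
assumed):

    static void dca_qmf_32_subbands_c(float samples_in[32][8], int sb_act,
                                      SynthFilterContext *synth, FFTContext *imdct,
                                      float synth_buf_ptr[512],
                                      int *synth_buf_offset, float synth_buf2[32],
                                      const float window[512], float *samples_out,
                                      float raXin[32], float scale)
    {
        int i, j;

        /* Subbands beyond sb_act stay silent */
        for (i = sb_act; i < 32; i++)
            raXin[i] = 0.0f;

        /* One synthesis filter call per time sample */
        for (i = 0; i < 8; i++) {
            /* Subbands with j % 4 == 0 or 3 enter sign-flipped */
            for (j = 0; j < sb_act; j++)
                raXin[j] = ((j - 1) & 2) ? -samples_in[j][i]
                                         :  samples_in[j][i];
            synth->synth_filter_float(imdct, synth_buf_ptr, synth_buf_offset,
                                      synth_buf2, window, samples_out,
                                      raXin, scale);
            samples_out += 32;
        }
    }

The VFP version instead transposes (and sign-flips) all eight rows into
a single stack buffer up front, then issues the eight
ff_synth_filter_float_vfp calls back to back.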
Diffstat (limited to 'lib/ffmpeg/libavcodec')
-rw-r--r--  lib/ffmpeg/libavcodec/arm/dcadsp_init_arm.c    10
-rw-r--r--  lib/ffmpeg/libavcodec/arm/dcadsp_vfp.S        493
2 files changed, 502 insertions, 1 deletion
diff --git a/lib/ffmpeg/libavcodec/arm/dcadsp_init_arm.c b/lib/ffmpeg/libavcodec/arm/dcadsp_init_arm.c
index a1efbfff7f..58267a2fe6 100644
--- a/lib/ffmpeg/libavcodec/arm/dcadsp_init_arm.c
+++ b/lib/ffmpeg/libavcodec/arm/dcadsp_init_arm.c
@@ -26,6 +26,12 @@
void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
int decifactor, float scale);
+void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
+ SynthFilterContext *synth, FFTContext *imdct,
+ float synth_buf_ptr[512],
+ int *synth_buf_offset, float synth_buf2[32],
+ const float window[512], float *samples_out,
+ float raXin[32], float scale);
void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
int decifactor, float scale);
@@ -33,8 +39,10 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
- if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
+ if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
s->lfe_fir = ff_dca_lfe_fir_vfp;
+ s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
+ }
if (have_neon(cpu_flags))
s->lfe_fir = ff_dca_lfe_fir_neon;
}
diff --git a/lib/ffmpeg/libavcodec/arm/dcadsp_vfp.S b/lib/ffmpeg/libavcodec/arm/dcadsp_vfp.S
new file mode 100644
index 0000000000..6039e87dfc
--- /dev/null
+++ b/lib/ffmpeg/libavcodec/arm/dcadsp_vfp.S
@@ -0,0 +1,493 @@
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+POUT .req a1
+PIN .req a2
+PCOEF .req a3
+DECIFACTOR .req a4
+OLDFPSCR .req a4
+COUNTER .req ip
+
+SCALE32 .req s28 @ use vector of 4 in place of 9th scalar when decifactor=32 / JMAX=8
+SCALE64 .req s0 @ spare register in scalar bank when decifactor=64 / JMAX=4
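+ @ Note: in VFP short-vector mode, s0-s7 always behave as scalars,
+ @ while s8-s15, s16-s23 and s24-s31 form banks that vector
+ @ operations step through, wrapping within each bank.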
+IN0 .req s4
+IN1 .req s5
+IN2 .req s6
+IN3 .req s7
+IN4 .req s0
+IN5 .req s1
+IN6 .req s2
+IN7 .req s3
+COEF0 .req s8 @ coefficient elements
+COEF1 .req s9
+COEF2 .req s10
+COEF3 .req s11
+COEF4 .req s12
+COEF5 .req s13
+COEF6 .req s14
+COEF7 .req s15
+ACCUM0 .req s16 @ double-buffered multiply-accumulate results
+ACCUM4 .req s20
+POST0 .req s24 @ do long-latency post-multiply in this vector in parallel
+POST1 .req s25
+POST2 .req s26
+POST3 .req s27
+
+
+.macro inner_loop decifactor, dir, tail, head
+ .ifc "\dir","up"
+ .set X, 0
+ .set Y, 4
+ .else
+ .set X, 4*JMAX*4 - 4
+ .set Y, -4
+ .endif
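+ @ X is the byte offset of the first coefficient read and Y the
+ @ per-element step: "up" starts at offset 0 and walks forwards,
+ @ "down" starts at the last element and walks backwards.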
+ .ifnc "\head",""
+ vldr COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
+ vldr COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
+ vldr COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
+ vldr COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
+ .endif
+ .ifnc "\tail",""
+ vadd.f POST0, ACCUM0, ACCUM4 @ vector operation
+ .endif
+ .ifnc "\head",""
+ vmul.f ACCUM0, COEF0, IN0 @ vector = vector * scalar
+ vldr COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
+ vldr COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
+ vldr COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
+ .endif
+ .ifnc "\tail",""
+ vmul.f POST0, POST0, SCALE\decifactor @ vector operation (SCALE may be scalar)
+ .endif
+ .ifnc "\head",""
+ vldr COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
+ .ifc "\tail",""
+ vmul.f ACCUM4, COEF4, IN1 @ vector operation
+ .endif
+ vldr COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
+ vldr COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
+ .ifnc "\tail",""
+ vmul.f ACCUM4, COEF4, IN1 @ vector operation
+ .endif
+ vldr COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
+ vldr COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
+ .endif
+ .ifnc "\tail",""
+ vstmia POUT!, {POST0-POST3}
+ .endif
+ .ifnc "\head",""
+ vmla.f ACCUM0, COEF0, IN2 @ vector += vector * scalar
+ vldr COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
+ vldr COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
+ vldr COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
+ vldr COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
+ vmla.f ACCUM4, COEF4, IN3 @ vector += vector * scalar
+ .if \decifactor == 32
+ vldr COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
+ vldr COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
+ vldr COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
+ vldr COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
+ vmla.f ACCUM0, COEF0, IN4 @ vector += vector * scalar
+ vldr COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
+ vldr COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
+ vldr COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
+ vldr COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
+ vmla.f ACCUM4, COEF4, IN5 @ vector += vector * scalar
+ vldr COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
+ vldr COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
+ vldr COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
+ vldr COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
+ vmla.f ACCUM0, COEF0, IN6 @ vector += vector * scalar
+ vldr COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
+ vldr COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
+ vldr COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
+ vldr COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
+ vmla.f ACCUM4, COEF4, IN7 @ vector += vector * scalar
+ .endif
+ .endif
+.endm
+
+.macro dca_lfe_fir decifactor
+ .if \decifactor == 32
+ .set JMAX, 8
+ vpush {s16-s31}
+ vmov SCALE32, s0 @ duplicate scalar across vector
+ vldr IN4, [PIN, #-4*4]
+ vldr IN5, [PIN, #-5*4]
+ vldr IN6, [PIN, #-6*4]
+ vldr IN7, [PIN, #-7*4]
+ .else
+ .set JMAX, 4
+ vpush {s16-s27}
+ .endif
+
+ mov COUNTER, #\decifactor/4 - 1
+ inner_loop \decifactor, up,, head
+1: add PCOEF, PCOEF, #4*JMAX*4
+ subs COUNTER, COUNTER, #1
+ inner_loop \decifactor, up, tail, head
+ bne 1b
+ inner_loop \decifactor, up, tail
+
+ mov COUNTER, #\decifactor/4 - 1
+ inner_loop \decifactor, down,, head
+1: sub PCOEF, PCOEF, #4*JMAX*4
+ subs COUNTER, COUNTER, #1
+ inner_loop \decifactor, down, tail, head
+ bne 1b
+ inner_loop \decifactor, down, tail
+
+ .if \decifactor == 32
+ vpop {s16-s31}
+ .else
+ vpop {s16-s27}
+ .endif
+ fmxr FPSCR, OLDFPSCR
+ bx lr
+.endm
+
+
+/* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
+ * int decifactor, float scale)
+ */
+function ff_dca_lfe_fir_vfp, export=1
+ teq DECIFACTOR, #32
+ fmrx OLDFPSCR, FPSCR
+ ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
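+ @ FPSCR fields: bits 25-24 = DN+FZ ("RunFast"), bits 18-16 =
+ @ LEN-1 (3 here, so short vectors of 4), bits 21-20 = STRIDE
+ @ code (0 here, so stride 1)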
+ fmxr FPSCR, ip
+NOVFP vldr s0, [sp]
+ vldr IN0, [PIN, #-0*4]
+ vldr IN1, [PIN, #-1*4]
+ vldr IN2, [PIN, #-2*4]
+ vldr IN3, [PIN, #-3*4]
+ beq 32f
+64: dca_lfe_fir 64
+ .ltorg
+32: dca_lfe_fir 32
+endfunc
+
+ .unreq POUT
+ .unreq PIN
+ .unreq PCOEF
+ .unreq DECIFACTOR
+ .unreq OLDFPSCR
+ .unreq COUNTER
+
+ .unreq SCALE32
+ .unreq SCALE64
+ .unreq IN0
+ .unreq IN1
+ .unreq IN2
+ .unreq IN3
+ .unreq IN4
+ .unreq IN5
+ .unreq IN6
+ .unreq IN7
+ .unreq COEF0
+ .unreq COEF1
+ .unreq COEF2
+ .unreq COEF3
+ .unreq COEF4
+ .unreq COEF5
+ .unreq COEF6
+ .unreq COEF7
+ .unreq ACCUM0
+ .unreq ACCUM4
+ .unreq POST0
+ .unreq POST1
+ .unreq POST2
+ .unreq POST3
+
+
+IN .req a1
+SBACT .req a2
+OLDFPSCR .req a3
+IMDCT .req a4
+WINDOW .req v1
+OUT .req v2
+BUF .req v3
+SCALEINT .req v4 @ only used in softfp case
+COUNT .req v5
+
+SCALE .req s0
+
+/* Stack layout differs in softfp and hardfp cases:
+ *
+ * hardfp
+ * fp -> 6 arg words saved by caller
+ * a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
+ * s16-s23 on entry
+ * align 16
+ * buf -> 8*32*4 bytes buffer
+ * s0 on entry
+ * sp -> 3 arg words for callee
+ *
+ * softfp
+ * fp -> 7 arg words saved by caller
+ * a4,v1-v5,fp,lr on entry
+ * s16-s23 on entry
+ * align 16
+ * buf -> 8*32*4 bytes buffer
+ * sp -> 4 arg words for callee
+ */
+
+/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
+ * SynthFilterContext *synth, FFTContext *imdct,
+ * float synth_buf_ptr[512],
+ * int *synth_buf_offset, float synth_buf2[32],
+ * const float window[512], float *samples_out,
+ * float raXin[32], float scale);
+ */
+function ff_dca_qmf_32_subbands_vfp, export=1
+VFP push {a3-a4,v1-v3,v5,fp,lr}
+NOVFP push {a4,v1-v5,fp,lr}
+ add fp, sp, #8*4
+ vpush {s16-s23}
+ @ The buffer pointed at by raXin isn't big enough for us to do a
+ @ complete matrix transposition as we want to, so allocate an
+ @ alternative buffer from the stack. Align to 4 words for speed.
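+ @ 8*32*4 = 1024 bytes: an 8-row by 32-column float matrix, one
+ @ row per synth filter call; bic #15 rounds the pointer down to
+ @ a 16-byte (4-word) boundary.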
+ sub BUF, sp, #8*32*4
+ bic BUF, BUF, #15
+ mov sp, BUF
+ ldr lr, =0x03330000 @ RunFast mode, short vectors of length 4, stride 2
+ fmrx OLDFPSCR, FPSCR
+ fmxr FPSCR, lr
+ @ COUNT is used to count down 2 things at once:
+ @ bits 0-4 are the number of word pairs remaining in the output row
+ @ bits 5-31 are the number of words to copy (with possible negation)
+ @ from the source matrix before we start zeroing the remainder
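+ @ Worked example (sb_act = 30, purely for illustration):
+ @ COUNT = ((30 - 4) << 5) + 16 = 848; each pass of the main loop
+ @ copies 4 subbands and subtracts (4 << 5) + 2, so after 7 passes
+ @ COUNT = -62 and the loop exits with 28 subbands copied; the
+ @ tail code then copies subbands 28-29 and zeroes subbands 30-31.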
+ mov COUNT, #(-4 << 5) + 16
+ adds COUNT, COUNT, SBACT, lsl #5
+ bmi 2f
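+ @ In scalar terms each pass below computes, for four subbands j
+ @ and i = 0..7 (illustrative C, not from the patch):
+ @   BUF[i][j] = ((j - 1) & 2) ? -samples_in[j][i] : samples_in[j][i];
+ @ With stride 2, each vneg.f spans the four even- or odd-numbered
+ @ registers holding one row, negating subbands j % 4 == 0 and 3.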
+1:
+ vldr s8, [IN, #(0*8+0)*4]
+ vldr s10, [IN, #(0*8+1)*4]
+ vldr s12, [IN, #(0*8+2)*4]
+ vldr s14, [IN, #(0*8+3)*4]
+ vldr s16, [IN, #(0*8+4)*4]
+ vldr s18, [IN, #(0*8+5)*4]
+ vldr s20, [IN, #(0*8+6)*4]
+ vldr s22, [IN, #(0*8+7)*4]
+ vneg.f s8, s8
+ vldr s9, [IN, #(1*8+0)*4]
+ vldr s11, [IN, #(1*8+1)*4]
+ vldr s13, [IN, #(1*8+2)*4]
+ vldr s15, [IN, #(1*8+3)*4]
+ vneg.f s16, s16
+ vldr s17, [IN, #(1*8+4)*4]
+ vldr s19, [IN, #(1*8+5)*4]
+ vldr s21, [IN, #(1*8+6)*4]
+ vldr s23, [IN, #(1*8+7)*4]
+ vstr d4, [BUF, #(0*32+0)*4]
+ vstr d5, [BUF, #(1*32+0)*4]
+ vstr d6, [BUF, #(2*32+0)*4]
+ vstr d7, [BUF, #(3*32+0)*4]
+ vstr d8, [BUF, #(4*32+0)*4]
+ vstr d9, [BUF, #(5*32+0)*4]
+ vstr d10, [BUF, #(6*32+0)*4]
+ vstr d11, [BUF, #(7*32+0)*4]
+ vldr s9, [IN, #(3*8+0)*4]
+ vldr s11, [IN, #(3*8+1)*4]
+ vldr s13, [IN, #(3*8+2)*4]
+ vldr s15, [IN, #(3*8+3)*4]
+ vldr s17, [IN, #(3*8+4)*4]
+ vldr s19, [IN, #(3*8+5)*4]
+ vldr s21, [IN, #(3*8+6)*4]
+ vldr s23, [IN, #(3*8+7)*4]
+ vneg.f s9, s9
+ vldr s8, [IN, #(2*8+0)*4]
+ vldr s10, [IN, #(2*8+1)*4]
+ vldr s12, [IN, #(2*8+2)*4]
+ vldr s14, [IN, #(2*8+3)*4]
+ vneg.f s17, s17
+ vldr s16, [IN, #(2*8+4)*4]
+ vldr s18, [IN, #(2*8+5)*4]
+ vldr s20, [IN, #(2*8+6)*4]
+ vldr s22, [IN, #(2*8+7)*4]
+ vstr d4, [BUF, #(0*32+2)*4]
+ vstr d5, [BUF, #(1*32+2)*4]
+ vstr d6, [BUF, #(2*32+2)*4]
+ vstr d7, [BUF, #(3*32+2)*4]
+ vstr d8, [BUF, #(4*32+2)*4]
+ vstr d9, [BUF, #(5*32+2)*4]
+ vstr d10, [BUF, #(6*32+2)*4]
+ vstr d11, [BUF, #(7*32+2)*4]
+ add IN, IN, #4*8*4
+ add BUF, BUF, #4*4
+ subs COUNT, COUNT, #(4 << 5) + 2
+ bpl 1b
+2: @ Now deal with trailing < 4 samples
+ adds COUNT, COUNT, #3 << 5
+ bmi 4f @ sb_act was a multiple of 4
+ bics lr, COUNT, #0x1F
+ bne 3f
+ @ sb_act was n*4+1
+ vldr s8, [IN, #(0*8+0)*4]
+ vldr s10, [IN, #(0*8+1)*4]
+ vldr s12, [IN, #(0*8+2)*4]
+ vldr s14, [IN, #(0*8+3)*4]
+ vldr s16, [IN, #(0*8+4)*4]
+ vldr s18, [IN, #(0*8+5)*4]
+ vldr s20, [IN, #(0*8+6)*4]
+ vldr s22, [IN, #(0*8+7)*4]
+ vneg.f s8, s8
+ vldr s9, zero
+ vldr s11, zero
+ vldr s13, zero
+ vldr s15, zero
+ vneg.f s16, s16
+ vldr s17, zero
+ vldr s19, zero
+ vldr s21, zero
+ vldr s23, zero
+ vstr d4, [BUF, #(0*32+0)*4]
+ vstr d5, [BUF, #(1*32+0)*4]
+ vstr d6, [BUF, #(2*32+0)*4]
+ vstr d7, [BUF, #(3*32+0)*4]
+ vstr d8, [BUF, #(4*32+0)*4]
+ vstr d9, [BUF, #(5*32+0)*4]
+ vstr d10, [BUF, #(6*32+0)*4]
+ vstr d11, [BUF, #(7*32+0)*4]
+ add BUF, BUF, #2*4
+ sub COUNT, COUNT, #1
+ b 4f
+3: @ sb_act was n*4+2 or n*4+3, so do the first 2
+ vldr s8, [IN, #(0*8+0)*4]
+ vldr s10, [IN, #(0*8+1)*4]
+ vldr s12, [IN, #(0*8+2)*4]
+ vldr s14, [IN, #(0*8+3)*4]
+ vldr s16, [IN, #(0*8+4)*4]
+ vldr s18, [IN, #(0*8+5)*4]
+ vldr s20, [IN, #(0*8+6)*4]
+ vldr s22, [IN, #(0*8+7)*4]
+ vneg.f s8, s8
+ vldr s9, [IN, #(1*8+0)*4]
+ vldr s11, [IN, #(1*8+1)*4]
+ vldr s13, [IN, #(1*8+2)*4]
+ vldr s15, [IN, #(1*8+3)*4]
+ vneg.f s16, s16
+ vldr s17, [IN, #(1*8+4)*4]
+ vldr s19, [IN, #(1*8+5)*4]
+ vldr s21, [IN, #(1*8+6)*4]
+ vldr s23, [IN, #(1*8+7)*4]
+ vstr d4, [BUF, #(0*32+0)*4]
+ vstr d5, [BUF, #(1*32+0)*4]
+ vstr d6, [BUF, #(2*32+0)*4]
+ vstr d7, [BUF, #(3*32+0)*4]
+ vstr d8, [BUF, #(4*32+0)*4]
+ vstr d9, [BUF, #(5*32+0)*4]
+ vstr d10, [BUF, #(6*32+0)*4]
+ vstr d11, [BUF, #(7*32+0)*4]
+ add BUF, BUF, #2*4
+ sub COUNT, COUNT, #(2 << 5) + 1
+ bics lr, COUNT, #0x1F
+ bne 4f
+ @ sb_act was n*4+3
+ vldr s8, [IN, #(2*8+0)*4]
+ vldr s10, [IN, #(2*8+1)*4]
+ vldr s12, [IN, #(2*8+2)*4]
+ vldr s14, [IN, #(2*8+3)*4]
+ vldr s16, [IN, #(2*8+4)*4]
+ vldr s18, [IN, #(2*8+5)*4]
+ vldr s20, [IN, #(2*8+6)*4]
+ vldr s22, [IN, #(2*8+7)*4]
+ vldr s9, zero
+ vldr s11, zero
+ vldr s13, zero
+ vldr s15, zero
+ vldr s17, zero
+ vldr s19, zero
+ vldr s21, zero
+ vldr s23, zero
+ vstr d4, [BUF, #(0*32+0)*4]
+ vstr d5, [BUF, #(1*32+0)*4]
+ vstr d6, [BUF, #(2*32+0)*4]
+ vstr d7, [BUF, #(3*32+0)*4]
+ vstr d8, [BUF, #(4*32+0)*4]
+ vstr d9, [BUF, #(5*32+0)*4]
+ vstr d10, [BUF, #(6*32+0)*4]
+ vstr d11, [BUF, #(7*32+0)*4]
+ add BUF, BUF, #2*4
+ sub COUNT, COUNT, #1
+4: @ Now fill the remainder with 0
+ vldr s8, zero
+ vldr s9, zero
+ ands COUNT, COUNT, #0x1F
+ beq 6f
+5: vstr d4, [BUF, #(0*32+0)*4]
+ vstr d4, [BUF, #(1*32+0)*4]
+ vstr d4, [BUF, #(2*32+0)*4]
+ vstr d4, [BUF, #(3*32+0)*4]
+ vstr d4, [BUF, #(4*32+0)*4]
+ vstr d4, [BUF, #(5*32+0)*4]
+ vstr d4, [BUF, #(6*32+0)*4]
+ vstr d4, [BUF, #(7*32+0)*4]
+ add BUF, BUF, #2*4
+ subs COUNT, COUNT, #1
+ bne 5b
+6:
+ fmxr FPSCR, OLDFPSCR
+ ldr WINDOW, [fp, #3*4]
+ ldr OUT, [fp, #4*4]
+ sub BUF, BUF, #32*4
+NOVFP ldr SCALEINT, [fp, #6*4]
+ mov COUNT, #8
+VFP vpush {SCALE}
+VFP sub sp, sp, #3*4
+NOVFP sub sp, sp, #4*4
+7:
+VFP ldr a1, [fp, #-7*4] @ imdct
+NOVFP ldr a1, [fp, #-8*4]
+ ldmia fp, {a2-a4}
+VFP stmia sp, {WINDOW, OUT, BUF}
+NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT}
+VFP vldr SCALE, [sp, #3*4]
+ bl ff_synth_filter_float_vfp
+ add OUT, OUT, #32*4
+ add BUF, BUF, #32*4
+ subs COUNT, COUNT, #1
+ bne 7b
+
+A sub sp, fp, #(8+8)*4
+T sub fp, fp, #(8+8)*4
+T mov sp, fp
+ vpop {s16-s23}
+VFP pop {a3-a4,v1-v3,v5,fp,pc}
+NOVFP pop {a4,v1-v5,fp,pc}
+endfunc
+
+ .unreq IN
+ .unreq SBACT
+ .unreq OLDFPSCR
+ .unreq IMDCT
+ .unreq WINDOW
+ .unreq OUT
+ .unreq BUF
+ .unreq SCALEINT
+ .unreq COUNT
+
+ .unreq SCALE
+
+ .align 2
+zero: .word 0