diff options
Diffstat (limited to 'lib/ffmpeg')
-rw-r--r-- | lib/ffmpeg/libavcodec/arm/fft_neon.S | 73 | ||||
-rw-r--r-- | lib/ffmpeg/libavcodec/arm/h264idct_neon.S | 28 | ||||
-rw-r--r-- | lib/ffmpeg/libavcodec/arm/h264pred_neon.S | 5 | ||||
-rw-r--r-- | lib/ffmpeg/libavcodec/arm/simple_idct_neon.S | 3 | ||||
-rw-r--r-- | lib/ffmpeg/libavcodec/arm/vp3dsp_neon.S | 13 |
5 files changed, 83 insertions, 39 deletions
diff --git a/lib/ffmpeg/libavcodec/arm/fft_neon.S b/lib/ffmpeg/libavcodec/arm/fft_neon.S index 1db7abd146..6390065fc4 100644 --- a/lib/ffmpeg/libavcodec/arm/fft_neon.S +++ b/lib/ffmpeg/libavcodec/arm/fft_neon.S @@ -101,8 +101,12 @@ function fft8_neon bx lr endfunc + .align 4 +pmmp: .float +1.0, -1.0, -1.0, +1.0 +mppm: .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 + function fft16_neon - movrel r1, mppm + adr r1, mppm vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3} pld [r0, #32] vld1.32 {d2-d3}, [r1,:128] @@ -144,12 +148,16 @@ function fft16_neon vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14} vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6} vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a} - movrel r2, X(ff_cos_16) + ldr r2, _neon_label + ldr r3, L$diff1 + add r2, r3 +local_label1: + ldr r2, [pc, r2] vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8} vrev64.32 d1, d1 vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a} vrev64.32 d3, d3 - movrel r3, pmmp + adr r3, pmmp vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8} vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a} vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9} @@ -214,7 +222,7 @@ function fft_pass_neon add r2, r2, r0 @ &z[o2] add r3, r3, r0 @ &z[o3] vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]} - movrel r12, pmmp + adr r12, pmmp vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]} add r5, r5, r1 @ wim vld1.32 {d6-d7}, [r12,:128] @ pmmp @@ -279,6 +287,11 @@ function fft_pass_neon pop {r4-r6,pc} endfunc +.set L$offs16, 0 +.macro setTabOffs n, n2 +.set L$offs\n, L$offs\n2 + 4 +.endm + .macro def_fft n, n2, n4 .align 6 function fft\n\()_neon @@ -291,10 +304,15 @@ function fft\n\()_neon bl fft\n4\()_neon mov r0, r4 pop {r4, lr} - movrel r1, X(ff_cos_\n) + ldr r1, _neon_label + add r1, #L$diff\n +local_label\n: + ldr r1, [pc, r1] mov r2, #\n4/2 b fft_pass_neon endfunc +setTabOffs \n, \n2 +.set L$diff\n, _neon_label - local_label\n + L$offs\n - 8 .endm def_fft 32, 16, 8 @@ -310,10 +328,14 @@ endfunc def_fft 32768, 16384, 8192 def_fft 65536, 32768, 16384 +.set L$diffTab, fft_tab_neon_offs - local_label_tab - 8 function ff_fft_calc_neon, export=1 ldr r2, [r0] sub r2, r2, #2 - movrel r3, fft_tab_neon + ldr r3, fft_tab_neon_offs + add r3, #L$diffTab +local_label_tab: + add r3, pc ldr r3, [r3, r2, lsl #2] mov r0, r1 bx r3 @@ -349,9 +371,22 @@ function ff_fft_permute_neon, export=1 pop {r4,pc} endfunc - .section .rodata + +.global _neon_label +_neon_label: +.word _neon_cos_tab - . + +L$diff1: +.word _neon_label - local_label1 - 8 + +fft_tab_neon_offs: +.word _fft_tab_neon - . + + +.section .rodata + .align 4 -fft_tab_neon: +_fft_tab_neon: .word fft4_neon .word fft8_neon .word fft16_neon @@ -367,8 +402,20 @@ fft_tab_neon: .word fft16384_neon .word fft32768_neon .word fft65536_neon -ELF .size fft_tab_neon, . - fft_tab_neon - - .align 4 -pmmp: .float +1.0, -1.0, -1.0, +1.0 -mppm: .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 +ELF .size _fft_tab_neon, . - _fft_tab_neon + + .align 4 +_neon_cos_tab: + .word X(ff_cos_16) + .word X(ff_cos_32) + .word X(ff_cos_64) + .word X(ff_cos_128) + .word X(ff_cos_256) + .word X(ff_cos_512) + .word X(ff_cos_1024) + .word X(ff_cos_2048) + .word X(ff_cos_4096) + .word X(ff_cos_8192) + .word X(ff_cos_16384) + .word X(ff_cos_32768) + .word X(ff_cos_65536) diff --git a/lib/ffmpeg/libavcodec/arm/h264idct_neon.S b/lib/ffmpeg/libavcodec/arm/h264idct_neon.S index 6b6a669f35..8111975160 100644 --- a/lib/ffmpeg/libavcodec/arm/h264idct_neon.S +++ b/lib/ffmpeg/libavcodec/arm/h264idct_neon.S @@ -97,7 +97,7 @@ function ff_h264_idct_add16_neon, export=1 mov r1, r2 mov r2, r3 ldr r6, [sp, #24] - movrel r7, scan8 + adr r7, scan8 mov ip, #16 1: ldrb r8, [r7], #1 ldr r0, [r5], #4 @@ -117,6 +117,16 @@ function ff_h264_idct_add16_neon, export=1 pop {r4-r8,pc} endfunc + .align +scan8: .byte 4+1*8, 5+1*8, 4+2*8, 5+2*8 + .byte 6+1*8, 7+1*8, 6+2*8, 7+2*8 + .byte 4+3*8, 5+3*8, 4+4*8, 5+4*8 + .byte 6+3*8, 7+3*8, 6+4*8, 7+4*8 + .byte 1+1*8, 2+1*8 + .byte 1+2*8, 2+2*8 + .byte 1+4*8, 2+4*8 + .byte 1+5*8, 2+5*8 + function ff_h264_idct_add16intra_neon, export=1 push {r4-r8,lr} mov r4, r0 @@ -124,7 +134,7 @@ function ff_h264_idct_add16intra_neon, export=1 mov r1, r2 mov r2, r3 ldr r6, [sp, #24] - movrel r7, scan8 + adr r7, scan8 mov ip, #16 1: ldrb r8, [r7], #1 ldr r0, [r5], #4 @@ -149,7 +159,7 @@ function ff_h264_idct_add8_neon, export=1 add r1, r2, #16*32 mov r2, r3 ldr r6, [sp, #32] - movrel r7, scan8+16 + adr r7, scan8+16 mov ip, #7 1: ldrb r8, [r7], #1 ldr r0, [r5], #4 @@ -353,7 +363,7 @@ function ff_h264_idct8_add4_neon, export=1 mov r1, r2 mov r2, r3 ldr r6, [sp, #24] - movrel r7, scan8 + adr r7, scan8 mov r12, #16 1: ldrb r8, [r7], #4 ldr r0, [r5], #16 @@ -372,13 +382,3 @@ function ff_h264_idct8_add4_neon, export=1 bne 1b pop {r4-r8,pc} endfunc - - .section .rodata -scan8: .byte 4+1*8, 5+1*8, 4+2*8, 5+2*8 - .byte 6+1*8, 7+1*8, 6+2*8, 7+2*8 - .byte 4+3*8, 5+3*8, 4+4*8, 5+4*8 - .byte 6+3*8, 7+3*8, 6+4*8, 7+4*8 - .byte 1+1*8, 2+1*8 - .byte 1+2*8, 2+2*8 - .byte 1+4*8, 2+4*8 - .byte 1+5*8, 2+5*8 diff --git a/lib/ffmpeg/libavcodec/arm/h264pred_neon.S b/lib/ffmpeg/libavcodec/arm/h264pred_neon.S index 63c96ee725..357c10a3fa 100644 --- a/lib/ffmpeg/libavcodec/arm/h264pred_neon.S +++ b/lib/ffmpeg/libavcodec/arm/h264pred_neon.S @@ -123,7 +123,7 @@ function ff_pred16x16_plane_neon, export=1 vaddl.u8 q8, d2, d3 vsubl.u8 q2, d2, d0 vsubl.u8 q3, d3, d1 - movrel r3, p16weight + adr r3, p16weight vld1.8 {q0}, [r3,:128] vmul.s16 q2, q2, q0 vmul.s16 q3, q3, q0 @@ -166,7 +166,6 @@ function ff_pred16x16_plane_neon, export=1 bx lr endfunc - .section .rodata .align 4 p16weight: .short 1,2,3,4,5,6,7,8 @@ -207,7 +206,7 @@ function ff_pred8x8_plane_neon, export=1 vrev32.8 d0, d0 vtrn.32 d2, d3 vsubl.u8 q2, d2, d0 - movrel r3, p16weight + adr r3, p16weight vld1.16 {q0}, [r3,:128] vmul.s16 d4, d4, d0 vmul.s16 d5, d5, d0 diff --git a/lib/ffmpeg/libavcodec/arm/simple_idct_neon.S b/lib/ffmpeg/libavcodec/arm/simple_idct_neon.S index 17cde5835a..e61414eda7 100644 --- a/lib/ffmpeg/libavcodec/arm/simple_idct_neon.S +++ b/lib/ffmpeg/libavcodec/arm/simple_idct_neon.S @@ -239,7 +239,6 @@ function idct_col4_st8_neon bx lr endfunc - .section .rodata .align 4 idct_coeff_neon: .short W1, W2, W3, W4, W5, W6, W7, W4c @@ -249,7 +248,7 @@ idct_coeff_neon: pld [\data] pld [\data, #64] vpush {d8-d15} - movrel r3, idct_coeff_neon + adr r3, idct_coeff_neon vld1.64 {d0,d1}, [r3,:128] .endm diff --git a/lib/ffmpeg/libavcodec/arm/vp3dsp_neon.S b/lib/ffmpeg/libavcodec/arm/vp3dsp_neon.S index d97ed3d21d..74bf7ba4dc 100644 --- a/lib/ffmpeg/libavcodec/arm/vp3dsp_neon.S +++ b/lib/ffmpeg/libavcodec/arm/vp3dsp_neon.S @@ -20,12 +20,9 @@ #include "asm.S" -.section .rodata +.text .align 4 -vp3_idct_constants: -.short 64277, 60547, 54491, 46341, 36410, 25080, 12785 - #define xC1S7 d0[0] #define xC2S6 d0[1] #define xC3S5 d0[2] @@ -34,8 +31,6 @@ vp3_idct_constants: #define xC6S2 d1[1] #define xC7S1 d1[2] -.text - .macro vp3_loop_filter vsubl.u8 q3, d18, d17 vsubl.u8 q2, d16, d19 @@ -109,10 +104,14 @@ function ff_vp3_h_loop_filter_neon, export=1 bx lr endfunc +.align 4 +vp3_idct_constants: +.short 64277, 60547, 54491, 46341, 36410, 25080, 12785 +.align 4 function vp3_idct_start_neon vpush {d8-d15} - movrel r3, vp3_idct_constants + adr r3, vp3_idct_constants vld1.64 {d0-d1}, [r3,:128] vld1.64 {d16-d19}, [r2,:128]! vld1.64 {d20-d23}, [r2,:128]! |