author     Memphiz <memphis@machzwo.de>   2012-10-20 14:15:28 +0200
committer  Memphiz <memphis@machzwo.de>   2012-10-20 14:20:29 +0200
commit     7e89d79507500370d7283ad42e1d479e2e74dc1c (patch)
tree       6eccda0425c808e4c3017374fe64c4a8ebeee3ef /lib
parent     f2d217d7751195630f85702430df0a52e6ab886a (diff)
[ffmpeg] - patch file for "ARM: fix overreads in neon h264 chroma mc"
Diffstat (limited to 'lib')
-rw-r--r--  lib/ffmpeg/patches/0034-ARM-fix-h264-chroma-mc-overrides.patch  257
1 file changed, 257 insertions, 0 deletions
diff --git a/lib/ffmpeg/patches/0034-ARM-fix-h264-chroma-mc-overrides.patch b/lib/ffmpeg/patches/0034-ARM-fix-h264-chroma-mc-overrides.patch
new file mode 100644
index 0000000000..8b13712f3e
--- /dev/null
+++ b/lib/ffmpeg/patches/0034-ARM-fix-h264-chroma-mc-overrides.patch
@@ -0,0 +1,257 @@
+From 1846ddf0a7870e6862905121d4e9da1c283d6ff6 Mon Sep 17 00:00:00 2001
+From: Mans Rullgard <mans@mansr.com>
+Date: Fri, 19 Oct 2012 13:39:11 +0100
+Subject: [PATCH] ARM: fix overreads in neon h264 chroma mc
+
+The loops were reading ahead one line, which could end up outside the
+buffer for reference blocks at the edge of the picture. Removing
+this readahead has no measurable performance impact.
+
+Signed-off-by: Mans Rullgard <mans@mansr.com>
+---
+ libavcodec/arm/h264cmc_neon.S | 86 +++++++++++++---------------------------
+ 1 files changed, 28 insertions(+), 58 deletions(-)
+
+diff --git a/libavcodec/arm/h264cmc_neon.S b/libavcodec/arm/h264cmc_neon.S
+index c7e5460..3427e36 100644
+--- a/libavcodec/arm/h264cmc_neon.S
++++ b/libavcodec/arm/h264cmc_neon.S
+@@ -51,24 +51,20 @@ T cmp r7, #0
+
+ beq 2f
+
+- add r5, r1, r2
+-
+ vdup.8 d0, r4
+- lsl r4, r2, #1
+ vdup.8 d1, r12
+- vld1.8 {d4, d5}, [r1], r4
++ vld1.8 {d4, d5}, [r1], r2
+ vdup.8 d2, r6
+- vld1.8 {d6, d7}, [r5], r4
+ vdup.8 d3, r7
+-
+ vext.8 d5, d4, d5, #1
+- vext.8 d7, d6, d7, #1
+
+-1: pld [r5]
++1: vld1.8 {d6, d7}, [r1], r2
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d5, d1
+- vld1.8 {d4, d5}, [r1], r4
++ vext.8 d7, d6, d7, #1
++ vld1.8 {d4, d5}, [r1], r2
+ vmlal.u8 q8, d6, d2
++ pld [r1]
+ vext.8 d5, d4, d5, #1
+ vmlal.u8 q8, d7, d3
+ vmull.u8 q9, d6, d0
+@@ -76,8 +72,7 @@ T cmp r7, #0
+ vmlal.u8 q9, d7, d1
+ vmlal.u8 q9, d4, d2
+ vmlal.u8 q9, d5, d3
+- vld1.8 {d6, d7}, [r5], r4
+- pld [r1]
++ pld [r1, r2]
+ .ifc \codec,h264
+ vrshrn.u16 d16, q8, #6
+ vrshrn.u16 d17, q9, #6
+@@ -92,7 +87,6 @@ T cmp r7, #0
+ vld1.8 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+ .endif
+- vext.8 d7, d6, d7, #1
+ vst1.8 {d16}, [r0,:64], r2
+ vst1.8 {d17}, [r0,:64], r2
+ bgt 1b
+@@ -106,18 +100,15 @@ T cmp r7, #0
+
+ beq 4f
+
+- add r5, r1, r2
+- lsl r4, r2, #1
+- vld1.8 {d4}, [r1], r4
+- vld1.8 {d6}, [r5], r4
++ vld1.8 {d4}, [r1], r2
+
+-3: pld [r5]
++3: vld1.8 {d6}, [r1], r2
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d6, d1
+- vld1.8 {d4}, [r1], r4
++ vld1.8 {d4}, [r1], r2
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d4, d1
+- vld1.8 {d6}, [r5], r4
++ pld [r1]
+ .ifc \codec,h264
+ vrshrn.u16 d16, q8, #6
+ vrshrn.u16 d17, q9, #6
+@@ -127,13 +118,13 @@ T cmp r7, #0
+ vshrn.u16 d16, q8, #6
+ vshrn.u16 d17, q9, #6
+ .endif
++ pld [r1, r2]
+ .ifc \type,avg
+ vld1.8 {d20}, [lr,:64], r2
+ vld1.8 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+ .endif
+ subs r3, r3, #2
+- pld [r1]
+ vst1.8 {d16}, [r0,:64], r2
+ vst1.8 {d17}, [r0,:64], r2
+ bgt 3b
+@@ -144,16 +135,13 @@ T cmp r7, #0
+ vld1.8 {d6, d7}, [r1], r2
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+-
+-5: pld [r1]
++ pld [r1]
+ subs r3, r3, #2
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d5, d1
+- vld1.8 {d4, d5}, [r1], r2
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d7, d1
+- pld [r1]
+- vext.8 d5, d4, d5, #1
++ pld [r1, r2]
+ .ifc \codec,h264
+ vrshrn.u16 d16, q8, #6
+ vrshrn.u16 d17, q9, #6
+@@ -168,11 +156,9 @@ T cmp r7, #0
+ vld1.8 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+ .endif
+- vld1.8 {d6, d7}, [r1], r2
+- vext.8 d7, d6, d7, #1
+ vst1.8 {d16}, [r0,:64], r2
+ vst1.8 {d17}, [r0,:64], r2
+- bgt 5b
++ bgt 4b
+
+ pop {r4-r7, pc}
+ endfunc
+@@ -209,33 +195,29 @@ T cmp r7, #0
+
+ beq 2f
+
+- add r5, r1, r2
+-
+ vdup.8 d0, r4
+- lsl r4, r2, #1
+ vdup.8 d1, r12
+- vld1.8 {d4}, [r1], r4
++ vld1.8 {d4}, [r1], r2
+ vdup.8 d2, r6
+- vld1.8 {d6}, [r5], r4
+ vdup.8 d3, r7
+
+ vext.8 d5, d4, d5, #1
+- vext.8 d7, d6, d7, #1
+ vtrn.32 d4, d5
+- vtrn.32 d6, d7
+
+ vtrn.32 d0, d1
+ vtrn.32 d2, d3
+
+-1: pld [r5]
++1: vld1.8 {d6}, [r1], r2
++ vext.8 d7, d6, d7, #1
++ vtrn.32 d6, d7
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d6, d2
+- vld1.8 {d4}, [r1], r4
++ vld1.8 {d4}, [r1], r2
+ vext.8 d5, d4, d5, #1
+ vtrn.32 d4, d5
++ pld [r1]
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d4, d2
+- vld1.8 {d6}, [r5], r4
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ .ifc \codec,h264
+@@ -245,14 +227,12 @@ T cmp r7, #0
+ vshrn.u16 d16, q8, #6
+ .endif
+ subs r3, r3, #2
+- pld [r1]
++ pld [r1, r2]
+ .ifc \type,avg
+ vld1.32 {d20[0]}, [lr,:32], r2
+ vld1.32 {d20[1]}, [lr,:32], r2
+ vrhadd.u8 d16, d16, d20
+ .endif
+- vext.8 d7, d6, d7, #1
+- vtrn.32 d6, d7
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+ bgt 1b
+@@ -268,18 +248,15 @@ T cmp r7, #0
+ beq 4f
+
+ vext.32 d1, d0, d1, #1
+- add r5, r1, r2
+- lsl r4, r2, #1
+- vld1.32 {d4[0]}, [r1], r4
+- vld1.32 {d4[1]}, [r5], r4
++ vld1.32 {d4[0]}, [r1], r2
+
+-3: pld [r5]
++3: vld1.32 {d4[1]}, [r1], r2
+ vmull.u8 q8, d4, d0
+- vld1.32 {d4[0]}, [r1], r4
++ vld1.32 {d4[0]}, [r1], r2
+ vmull.u8 q9, d4, d1
+- vld1.32 {d4[1]}, [r5], r4
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
++ pld [r1]
+ .ifc \codec,h264
+ vrshrn.u16 d16, q8, #6
+ .else
+@@ -292,7 +269,7 @@ T cmp r7, #0
+ vrhadd.u8 d16, d16, d20
+ .endif
+ subs r3, r3, #2
+- pld [r1]
++ pld [r1, r2]
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+ bgt 3b
+@@ -305,13 +282,9 @@ T cmp r7, #0
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d4, d5
+ vtrn.32 d6, d7
+-
+-5: vmull.u8 q8, d4, d0
++ vmull.u8 q8, d4, d0
+ vmull.u8 q9, d6, d0
+ subs r3, r3, #2
+- vld1.8 {d4}, [r1], r2
+- vext.8 d5, d4, d5, #1
+- vtrn.32 d4, d5
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ pld [r1]
+@@ -326,13 +299,10 @@ T cmp r7, #0
+ vld1.32 {d20[1]}, [lr,:32], r2
+ vrhadd.u8 d16, d16, d20
+ .endif
+- vld1.8 {d6}, [r1], r2
+- vext.8 d7, d6, d7, #1
+- vtrn.32 d6, d7
+ pld [r1]
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+- bgt 5b
++ bgt 4b
+
+ pop {r4-r7, pc}
+ endfunc
+--
+1.7.2.5
+
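For context on the fix being imported: the commit message says the loops read one source line ahead of the block being interpolated, which overruns the reference buffer when the block sits at the edge of the picture. Below is a minimal C sketch of that access pattern, an illustration only with made-up names; it is not the NEON code in h264cmc_neon.S. Producing h vertically filtered chroma rows needs exactly h+1 source rows, so a loop that preloads the row for its next iteration touches row h+1 on its last pass.

```c
#include <stdint.h>

/* Overreading shape: the value for the next iteration is loaded one pass
 * early, so the final pass reads row h+1 even though only rows 0..h of the
 * reference block are valid.  a + b == 64 (H.264 chroma weights assumed). */
static void chroma_mc_v_overread(uint8_t *dst, const uint8_t *src,
                                 long stride, int h, int a, int b)
{
    int p0 = src[0];
    int p1 = src[stride];
    for (int y = 0; y < h; y++) {
        dst[y * stride] = (a * p0 + b * p1 + 32) >> 6;  /* uses rows y, y+1 */
        p0 = p1;
        p1 = src[(y + 2) * stride];  /* read-ahead: row h+1 when y == h-1 */
    }
}

/* Shape after the fix: each row is loaded in the iteration that uses it,
 * so the last access is row h, the last valid reference row. */
static void chroma_mc_v_fixed(uint8_t *dst, const uint8_t *src,
                              long stride, int h, int a, int b)
{
    int p0 = src[0];
    for (int y = 0; y < h; y++) {
        int p1 = src[(y + 1) * stride];
        dst[y * stride] = (a * p0 + b * p1 + 32) >> 6;
        p0 = p1;  /* carry the row over instead of reading ahead */
    }
}
```

The patch achieves the same effect in the assembly by loading each source line in the iteration that consumes it and keeping only pld prefetch hints (pld [r1], pld [r1, r2]) for the rows ahead, which may touch the cache but cannot fault.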