author     Memphiz <memphis@machzwo.de>    2012-10-20 14:15:28 +0200
committer  Memphiz <memphis@machzwo.de>    2012-10-20 14:20:29 +0200
commit     7e89d79507500370d7283ad42e1d479e2e74dc1c (patch)
tree       6eccda0425c808e4c3017374fe64c4a8ebeee3ef /lib
parent     f2d217d7751195630f85702430df0a52e6ab886a (diff)
[ffmpeg] - patch file for "ARM: fix overreads in neon h264 chroma mc"
Diffstat (limited to 'lib')
-rw-r--r--  lib/ffmpeg/patches/0034-ARM-fix-h264-chroma-mc-overrides.patch | 257
1 file changed, 257 insertions, 0 deletions
diff --git a/lib/ffmpeg/patches/0034-ARM-fix-h264-chroma-mc-overrides.patch b/lib/ffmpeg/patches/0034-ARM-fix-h264-chroma-mc-overrides.patch
new file mode 100644
index 0000000000..8b13712f3e
--- /dev/null
+++ b/lib/ffmpeg/patches/0034-ARM-fix-h264-chroma-mc-overrides.patch
@@ -0,0 +1,257 @@
+From 1846ddf0a7870e6862905121d4e9da1c283d6ff6 Mon Sep 17 00:00:00 2001
+From: Mans Rullgard <mans@mansr.com>
+Date: Fri, 19 Oct 2012 13:39:11 +0100
+Subject: [PATCH] ARM: fix overreads in neon h264 chroma mc
+
+The loops were reading ahead one line, which could end up outside the
+buffer for reference blocks at the edge of the picture. Removing
+this readahead has no measurable performance impact.
+
+Signed-off-by: Mans Rullgard <mans@mansr.com>
+---
+ libavcodec/arm/h264cmc_neon.S | 86 +++++++++++++---------------------------
+ 1 files changed, 28 insertions(+), 58 deletions(-)
+
+diff --git a/libavcodec/arm/h264cmc_neon.S b/libavcodec/arm/h264cmc_neon.S
+index c7e5460..3427e36 100644
+--- a/libavcodec/arm/h264cmc_neon.S
++++ b/libavcodec/arm/h264cmc_neon.S
+@@ -51,24 +51,20 @@ T cmp r7, #0
+
+ beq 2f
+
+- add r5, r1, r2
+-
+ vdup.8 d0, r4
+- lsl r4, r2, #1
+ vdup.8 d1, r12
+- vld1.8 {d4, d5}, [r1], r4
++ vld1.8 {d4, d5}, [r1], r2
+ vdup.8 d2, r6
+- vld1.8 {d6, d7}, [r5], r4
+ vdup.8 d3, r7
+-
+ vext.8 d5, d4, d5, #1
+- vext.8 d7, d6, d7, #1
+
+-1: pld [r5]
++1: vld1.8 {d6, d7}, [r1], r2
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d5, d1
+- vld1.8 {d4, d5}, [r1], r4
++ vext.8 d7, d6, d7, #1
++ vld1.8 {d4, d5}, [r1], r2
+ vmlal.u8 q8, d6, d2
++ pld [r1]
+ vext.8 d5, d4, d5, #1
+ vmlal.u8 q8, d7, d3
+ vmull.u8 q9, d6, d0
+@@ -76,8 +72,7 @@ T cmp r7, #0
+ vmlal.u8 q9, d7, d1
+ vmlal.u8 q9, d4, d2
+ vmlal.u8 q9, d5, d3
+- vld1.8 {d6, d7}, [r5], r4
+- pld [r1]
++ pld [r1, r2]
+ .ifc \codec,h264
+ vrshrn.u16 d16, q8, #6
+ vrshrn.u16 d17, q9, #6
+@@ -92,7 +87,6 @@ T cmp r7, #0
+ vld1.8 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+ .endif
+- vext.8 d7, d6, d7, #1
+ vst1.8 {d16}, [r0,:64], r2
+ vst1.8 {d17}, [r0,:64], r2
+ bgt 1b
+@@ -106,18 +100,15 @@ T cmp r7, #0
+
+ beq 4f
+
+- add r5, r1, r2
+- lsl r4, r2, #1
+- vld1.8 {d4}, [r1], r4
+- vld1.8 {d6}, [r5], r4
++ vld1.8 {d4}, [r1], r2
+
+-3: pld [r5]
++3: vld1.8 {d6}, [r1], r2
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d6, d1
+- vld1.8 {d4}, [r1], r4
++ vld1.8 {d4}, [r1], r2
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d4, d1
+- vld1.8 {d6}, [r5], r4
++ pld [r1]
+ .ifc \codec,h264
+ vrshrn.u16 d16, q8, #6
+ vrshrn.u16 d17, q9, #6
+@@ -127,13 +118,13 @@ T cmp r7, #0
+ vshrn.u16 d16, q8, #6
+ vshrn.u16 d17, q9, #6
+ .endif
++ pld [r1, r2]
+ .ifc \type,avg
+ vld1.8 {d20}, [lr,:64], r2
+ vld1.8 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+ .endif
+ subs r3, r3, #2
+- pld [r1]
+ vst1.8 {d16}, [r0,:64], r2
+ vst1.8 {d17}, [r0,:64], r2
+ bgt 3b
+@@ -144,16 +135,13 @@ T cmp r7, #0
+ vld1.8 {d6, d7}, [r1], r2
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+-
+-5: pld [r1]
++ pld [r1]
+ subs r3, r3, #2
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d5, d1
+- vld1.8 {d4, d5}, [r1], r2
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d7, d1
+- pld [r1]
+- vext.8 d5, d4, d5, #1
++ pld [r1, r2]
+ .ifc \codec,h264
+ vrshrn.u16 d16, q8, #6
+ vrshrn.u16 d17, q9, #6
+@@ -168,11 +156,9 @@ T cmp r7, #0
+ vld1.8 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+ .endif
+- vld1.8 {d6, d7}, [r1], r2
+- vext.8 d7, d6, d7, #1
+ vst1.8 {d16}, [r0,:64], r2
+ vst1.8 {d17}, [r0,:64], r2
+- bgt 5b
++ bgt 4b
+
+ pop {r4-r7, pc}
+ endfunc
+@@ -209,33 +195,29 @@ T cmp r7, #0
+
+ beq 2f
+
+- add r5, r1, r2
+-
+ vdup.8 d0, r4
+- lsl r4, r2, #1
+ vdup.8 d1, r12
+- vld1.8 {d4}, [r1], r4
++ vld1.8 {d4}, [r1], r2
+ vdup.8 d2, r6
+- vld1.8 {d6}, [r5], r4
+ vdup.8 d3, r7
+
+ vext.8 d5, d4, d5, #1
+- vext.8 d7, d6, d7, #1
+ vtrn.32 d4, d5
+- vtrn.32 d6, d7
+
+ vtrn.32 d0, d1
+ vtrn.32 d2, d3
+
+-1: pld [r5]
++1: vld1.8 {d6}, [r1], r2
++ vext.8 d7, d6, d7, #1
++ vtrn.32 d6, d7
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d6, d2
+- vld1.8 {d4}, [r1], r4
++ vld1.8 {d4}, [r1], r2
+ vext.8 d5, d4, d5, #1
+ vtrn.32 d4, d5
++ pld [r1]
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d4, d2
+- vld1.8 {d6}, [r5], r4
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ .ifc \codec,h264
+@@ -245,14 +227,12 @@ T cmp r7, #0
+ vshrn.u16 d16, q8, #6
+ .endif
+ subs r3, r3, #2
+- pld [r1]
++ pld [r1, r2]
+ .ifc \type,avg
+ vld1.32 {d20[0]}, [lr,:32], r2
+ vld1.32 {d20[1]}, [lr,:32], r2
+ vrhadd.u8 d16, d16, d20
+ .endif
+- vext.8 d7, d6, d7, #1
+- vtrn.32 d6, d7
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+ bgt 1b
+@@ -268,18 +248,15 @@ T cmp r7, #0
+ beq 4f
+
+ vext.32 d1, d0, d1, #1
+- add r5, r1, r2
+- lsl r4, r2, #1
+- vld1.32 {d4[0]}, [r1], r4
+- vld1.32 {d4[1]}, [r5], r4
++ vld1.32 {d4[0]}, [r1], r2
+
+-3: pld [r5]
++3: vld1.32 {d4[1]}, [r1], r2
+ vmull.u8 q8, d4, d0
+- vld1.32 {d4[0]}, [r1], r4
++ vld1.32 {d4[0]}, [r1], r2
+ vmull.u8 q9, d4, d1
+- vld1.32 {d4[1]}, [r5], r4
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
++ pld [r1]
+ .ifc \codec,h264
+ vrshrn.u16 d16, q8, #6
+ .else
+@@ -292,7 +269,7 @@ T cmp r7, #0
+ vrhadd.u8 d16, d16, d20
+ .endif
+ subs r3, r3, #2
+- pld [r1]
++ pld [r1, r2]
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+ bgt 3b
+@@ -305,13 +282,9 @@ T cmp r7, #0
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d4, d5
+ vtrn.32 d6, d7
+-
+-5: vmull.u8 q8, d4, d0
++ vmull.u8 q8, d4, d0
+ vmull.u8 q9, d6, d0
+ subs r3, r3, #2
+- vld1.8 {d4}, [r1], r2
+- vext.8 d5, d4, d5, #1
+- vtrn.32 d4, d5
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ pld [r1]
+@@ -326,13 +299,10 @@ T cmp r7, #0
+ vld1.32 {d20[1]}, [lr,:32], r2
+ vrhadd.u8 d16, d16, d20
+ .endif
+- vld1.8 {d6}, [r1], r2
+- vext.8 d7, d6, d7, #1
+- vtrn.32 d6, d7
+ pld [r1]
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+- bgt 5b
++ bgt 4b
+
+ pop {r4-r7, pc}
+ endfunc
+--
+1.7.2.5
+
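
Background on the change carried by this patch: the embedded commit message notes that the chroma MC loops pre-loaded one source line for the next iteration, so a reference block at the bottom edge of the picture could be read past the end of its buffer; the rewritten loops load each line inside the iteration that consumes it. The small C program below is only an illustrative sketch of that pattern, not FFmpeg code: the function names, the two-tap averaging filter, and the buffer sizes are invented for the example.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAXW 16  /* arbitrary row-width cap for the illustration */

/* Old pattern: keep the next source row pre-loaded for the following
 * iteration.  On the last iteration (y == h - 1) this copies src row
 * h + 1, one row past the h + 1 rows the filter legitimately needs, so
 * it can fall outside the reference buffer at the picture edge. */
static void filter_pipelined(uint8_t *dst, const uint8_t *src,
                             int stride, int w, int h)
{
    uint8_t cur[MAXW], nxt[MAXW];
    memcpy(cur, src, w);                /* row 0 */
    memcpy(nxt, src + stride, w);       /* row 1 */
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            dst[y * stride + x] = (cur[x] + nxt[x] + 1) >> 1;
        memcpy(cur, nxt, w);
        memcpy(nxt, src + (y + 2) * stride, w); /* overread when y == h - 1 */
    }
}

/* Fixed pattern: load row y + 1 inside the iteration that consumes it.
 * The last source row touched is row h, exactly what a two-tap vertical
 * filter over h output rows requires. */
static void filter_in_loop(uint8_t *dst, const uint8_t *src,
                           int stride, int w, int h)
{
    uint8_t cur[MAXW], nxt[MAXW];
    memcpy(cur, src, w);                /* row 0 */
    for (int y = 0; y < h; y++) {
        memcpy(nxt, src + (y + 1) * stride, w);
        for (int x = 0; x < w; x++)
            dst[y * stride + x] = (cur[x] + nxt[x] + 1) >> 1;
        memcpy(cur, nxt, w);
    }
}

int main(void)
{
    enum { W = 8, H = 4, STRIDE = 8 };
    /* exactly the (H + 1) * STRIDE bytes the filter is entitled to read */
    uint8_t src[(H + 1) * STRIDE];
    uint8_t dst[H * STRIDE];
    for (size_t i = 0; i < sizeof(src); i++)
        src[i] = (uint8_t)i;

    filter_in_loop(dst, src, STRIDE, W, H);   /* stays inside src        */
    filter_pipelined(dst, src, STRIDE, W, H); /* reads past src once     */

    printf("dst[0] = %d\n", dst[0]);
    return 0;
}

Built with -fsanitize=address, the pipelined variant should be flagged as a stack-buffer overread on its final iteration while the in-loop variant is not; that is the same class of out-of-bounds read the NEON rewrite above removes, at no measurable performance cost according to the patch description.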