aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: jenkins4kodi <jenkins4kodi@users.noreply.github.com> 2015-11-11 23:28:09 +0100
committer: jenkins4kodi <jenkins4kodi@users.noreply.github.com> 2015-11-11 23:28:09 +0100
commit: a11c9e463293daee0f68bfbf311cff898d1626f5 (patch)
tree: be78bfc3deb32b29f6c183d62e62d11c08292f86
parent: 0d8015f5d63e179af3f591f69fc55fbc90a2deef (diff)
parent: 38e24d704081aee80ff34868b226457c8800a946 (diff)
Merge pull request #8376 from afedchin/fix_dvd_dxva
-rw-r--r-- xbmc/utils/win32/memcpy_sse2.h 62
1 files changed, 37 insertions, 25 deletions
diff --git a/xbmc/utils/win32/memcpy_sse2.h b/xbmc/utils/win32/memcpy_sse2.h
index c585136547..d5c5844b58 100644
--- a/xbmc/utils/win32/memcpy_sse2.h
+++ b/xbmc/utils/win32/memcpy_sse2.h
@@ -77,37 +77,49 @@ inline void convert_yuv420_nv12(uint8_t *const src[], const int srcStride[], int
uint8_t * u = src[1] + line * srcStride[1];
uint8_t * v = src[2] + line * srcStride[2];
uint8_t * d = dst[1] + line * dstStride[1];
- for (i = 0; i < (chromaWidth - 31); i += 32)
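+ // if any of the u/v/d pointers is not 16-byte aligned, fall back to a scalar byte-wise U/V interleave (the aligned SSE2 loads/stores below require 16-byte alignment)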
+ if (((size_t)(u) | (size_t)(v) | (size_t)(d)) & 0xF)
{
- xmm0 = _mm_load_si128((__m128i*)(v + i));
- xmm1 = _mm_load_si128((__m128i*)(u + i));
- xmm2 = _mm_load_si128((__m128i*)(v + i + 16));
- xmm3 = _mm_load_si128((__m128i*)(u + i + 16));
+ for (i = 0; i < chromaWidth; ++i)
+ {
+ *d++ = *u++;
+ *d++ = *v++;
+ }
+ }
+ else
+ {
+ for (i = 0; i < (chromaWidth - 31); i += 32)
+ {
+ xmm0 = _mm_load_si128((__m128i*)(v + i));
+ xmm1 = _mm_load_si128((__m128i*)(u + i));
+ xmm2 = _mm_load_si128((__m128i*)(v + i + 16));
+ xmm3 = _mm_load_si128((__m128i*)(u + i + 16));
- xmm4 = xmm0;
- xmm0 = _mm_unpacklo_epi8(xmm1, xmm0);
- xmm4 = _mm_unpackhi_epi8(xmm1, xmm4);
+ xmm4 = xmm0;
+ xmm0 = _mm_unpacklo_epi8(xmm1, xmm0);
+ xmm4 = _mm_unpackhi_epi8(xmm1, xmm4);
- xmm1 = xmm2;
- xmm2 = _mm_unpacklo_epi8(xmm3, xmm2);
- xmm1 = _mm_unpackhi_epi8(xmm3, xmm1);
+ xmm1 = xmm2;
+ xmm2 = _mm_unpacklo_epi8(xmm3, xmm2);
+ xmm1 = _mm_unpackhi_epi8(xmm3, xmm1);
- _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0);
- _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm4);
- _mm_stream_si128((__m128i *)(d + (i << 1) + 32), xmm2);
- _mm_stream_si128((__m128i *)(d + (i << 1) + 48), xmm1);
- }
- for (; i < chromaWidth; i += 16)
- {
- xmm0 = _mm_load_si128((__m128i*)(v + i));
- xmm1 = _mm_load_si128((__m128i*)(u + i));
+ _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0);
+ _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm4);
+ _mm_stream_si128((__m128i *)(d + (i << 1) + 32), xmm2);
+ _mm_stream_si128((__m128i *)(d + (i << 1) + 48), xmm1);
+ }
+ for (; i < chromaWidth; i += 16)
+ {
+ xmm0 = _mm_load_si128((__m128i*)(v + i));
+ xmm1 = _mm_load_si128((__m128i*)(u + i));
- xmm2 = xmm0;
- xmm0 = _mm_unpacklo_epi8(xmm1, xmm0);
- xmm2 = _mm_unpackhi_epi8(xmm1, xmm2);
+ xmm2 = xmm0;
+ xmm0 = _mm_unpacklo_epi8(xmm1, xmm0);
+ xmm2 = _mm_unpackhi_epi8(xmm1, xmm2);
- _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0);
- _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm2);
+ _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0);
+ _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm2);
+ }
}
}
}