diff options
author | Anton Fedchin <afedchin@ruswizards.com> | 2015-11-12 00:40:50 +0300 |
---|---|---|
committer | Anton Fedchin <afedchin@ruswizards.com> | 2015-11-12 00:40:50 +0300 |
commit | 38e24d704081aee80ff34868b226457c8800a946 (patch) | |
tree | be78bfc3deb32b29f6c183d62e62d11c08292f86 | |
parent | 0d8015f5d63e179af3f591f69fc55fbc90a2deef (diff) |
[win32/utils] Fixed converting yuv420 to nv12. Fixed trac #16217
-rw-r--r-- | xbmc/utils/win32/memcpy_sse2.h | 62 |
1 file changed, 37 insertions, 25 deletions
diff --git a/xbmc/utils/win32/memcpy_sse2.h b/xbmc/utils/win32/memcpy_sse2.h index c585136547..d5c5844b58 100644 --- a/xbmc/utils/win32/memcpy_sse2.h +++ b/xbmc/utils/win32/memcpy_sse2.h @@ -77,37 +77,49 @@ inline void convert_yuv420_nv12(uint8_t *const src[], const int srcStride[], int uint8_t * u = src[1] + line * srcStride[1]; uint8_t * v = src[2] + line * srcStride[2]; uint8_t * d = dst[1] + line * dstStride[1]; - for (i = 0; i < (chromaWidth - 31); i += 32) + // if memory is not aligned use memcpy + if (((size_t)(u) | (size_t)(v) | (size_t)(d)) & 0xF) { - xmm0 = _mm_load_si128((__m128i*)(v + i)); - xmm1 = _mm_load_si128((__m128i*)(u + i)); - xmm2 = _mm_load_si128((__m128i*)(v + i + 16)); - xmm3 = _mm_load_si128((__m128i*)(u + i + 16)); + for (i = 0; i < chromaWidth; ++i) + { + *d++ = *u++; + *d++ = *v++; + } + } + else + { + for (i = 0; i < (chromaWidth - 31); i += 32) + { + xmm0 = _mm_load_si128((__m128i*)(v + i)); + xmm1 = _mm_load_si128((__m128i*)(u + i)); + xmm2 = _mm_load_si128((__m128i*)(v + i + 16)); + xmm3 = _mm_load_si128((__m128i*)(u + i + 16)); - xmm4 = xmm0; - xmm0 = _mm_unpacklo_epi8(xmm1, xmm0); - xmm4 = _mm_unpackhi_epi8(xmm1, xmm4); + xmm4 = xmm0; + xmm0 = _mm_unpacklo_epi8(xmm1, xmm0); + xmm4 = _mm_unpackhi_epi8(xmm1, xmm4); - xmm1 = xmm2; - xmm2 = _mm_unpacklo_epi8(xmm3, xmm2); - xmm1 = _mm_unpackhi_epi8(xmm3, xmm1); + xmm1 = xmm2; + xmm2 = _mm_unpacklo_epi8(xmm3, xmm2); + xmm1 = _mm_unpackhi_epi8(xmm3, xmm1); - _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0); - _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm4); - _mm_stream_si128((__m128i *)(d + (i << 1) + 32), xmm2); - _mm_stream_si128((__m128i *)(d + (i << 1) + 48), xmm1); - } - for (; i < chromaWidth; i += 16) - { - xmm0 = _mm_load_si128((__m128i*)(v + i)); - xmm1 = _mm_load_si128((__m128i*)(u + i)); + _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0); + _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm4); + _mm_stream_si128((__m128i *)(d + (i << 1) + 32), xmm2); + 
_mm_stream_si128((__m128i *)(d + (i << 1) + 48), xmm1); + } + for (; i < chromaWidth; i += 16) + { + xmm0 = _mm_load_si128((__m128i*)(v + i)); + xmm1 = _mm_load_si128((__m128i*)(u + i)); - xmm2 = xmm0; - xmm0 = _mm_unpacklo_epi8(xmm1, xmm0); - xmm2 = _mm_unpackhi_epi8(xmm1, xmm2); + xmm2 = xmm0; + xmm0 = _mm_unpacklo_epi8(xmm1, xmm0); + xmm2 = _mm_unpackhi_epi8(xmm1, xmm2); - _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0); - _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm2); + _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0); + _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm2); + } } } } |