From 870f6007c174c587c220e4ca99f7888db943e413 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Wed, 4 Apr 2018 09:44:34 +0300 Subject: [win32] utils: move to platform folder --- cmake/treedata/windows/subdirs.txt | 2 +- cmake/treedata/windowsstore/subdirs.txt | 2 +- .../VideoPlayer/VideoRenderers/WinRenderBuffer.cpp | 4 +- xbmc/platform/win32/utils/CMakeLists.txt | 7 + .../platform/win32/utils/Win32InterfaceForCLog.cpp | 128 ++++++++ xbmc/platform/win32/utils/Win32InterfaceForCLog.h | 38 +++ xbmc/platform/win32/utils/gpu_memcpy_sse4.h | 129 ++++++++ xbmc/platform/win32/utils/memcpy_sse2.h | 333 +++++++++++++++++++++ xbmc/utils/log.cpp | 2 +- xbmc/utils/win32/CMakeLists.txt | 7 - xbmc/utils/win32/Win32InterfaceForCLog.cpp | 128 -------- xbmc/utils/win32/Win32InterfaceForCLog.h | 38 --- xbmc/utils/win32/gpu_memcpy_sse4.h | 129 -------- xbmc/utils/win32/memcpy_sse2.h | 333 --------------------- 14 files changed, 640 insertions(+), 640 deletions(-) create mode 100644 xbmc/platform/win32/utils/CMakeLists.txt create mode 100644 xbmc/platform/win32/utils/Win32InterfaceForCLog.cpp create mode 100644 xbmc/platform/win32/utils/Win32InterfaceForCLog.h create mode 100644 xbmc/platform/win32/utils/gpu_memcpy_sse4.h create mode 100644 xbmc/platform/win32/utils/memcpy_sse2.h delete mode 100644 xbmc/utils/win32/CMakeLists.txt delete mode 100644 xbmc/utils/win32/Win32InterfaceForCLog.cpp delete mode 100644 xbmc/utils/win32/Win32InterfaceForCLog.h delete mode 100644 xbmc/utils/win32/gpu_memcpy_sse4.h delete mode 100644 xbmc/utils/win32/memcpy_sse2.h diff --git a/cmake/treedata/windows/subdirs.txt b/cmake/treedata/windows/subdirs.txt index 8294c3ccf2..a2c8c29f14 100644 --- a/cmake/treedata/windows/subdirs.txt +++ b/cmake/treedata/windows/subdirs.txt @@ -5,10 +5,10 @@ xbmc/platform/win32/network platform/win32/network xbmc/platform/win32/peripherals platform/win32/peripherals xbmc/platform/win32/powermanagement platform/win32/powermanagement xbmc/platform/win32/storage 
platform/win32/storage +xbmc/platform/win32/utils platform/win32/utils xbmc/input/touch input/touch xbmc/input/touch/generic input/touch/generic xbmc/network/mdns network/mdns -xbmc/utils/win32 utils/win32 xbmc/rendering/dx rendering/dx xbmc/threads/platform/win threads/win xbmc/windowing/windows windowing/windows diff --git a/cmake/treedata/windowsstore/subdirs.txt b/cmake/treedata/windowsstore/subdirs.txt index 1084447281..604be08cd3 100644 --- a/cmake/treedata/windowsstore/subdirs.txt +++ b/cmake/treedata/windowsstore/subdirs.txt @@ -5,10 +5,10 @@ xbmc/platform/win10/peripherals platform/win10/peripherals xbmc/platform/win10/powermanagement platfrom/win10/powermanagement xbmc/platform/win10/storage platfrom/win10/storage xbmc/platform/win32/filesystem platform/win32/filesystem +xbmc/platform/win32/utils platform/win32/utils xbmc/input/touch input/touch xbmc/input/touch/generic input/touch/generic xbmc/network/mdns network/mdns -xbmc/utils/win32 utils/win32 xbmc/rendering/dx rendering/dx xbmc/threads/platform/win threads/win xbmc/windowing/win10 windowing/win10 diff --git a/xbmc/cores/VideoPlayer/VideoRenderers/WinRenderBuffer.cpp b/xbmc/cores/VideoPlayer/VideoRenderers/WinRenderBuffer.cpp index 147f570d94..f8aaca8a5e 100644 --- a/xbmc/cores/VideoPlayer/VideoRenderers/WinRenderBuffer.cpp +++ b/xbmc/cores/VideoPlayer/VideoRenderers/WinRenderBuffer.cpp @@ -28,9 +28,9 @@ #include "rendering/dx/RenderContext.h" #include "utils/log.h" #if defined(HAVE_SSE2) -#include "utils/win32/gpu_memcpy_sse4.h" +#include "platform/win32/utils/gpu_memcpy_sse4.h" #endif -#include "utils/win32/memcpy_sse2.h" +#include "platform/win32/utils/memcpy_sse2.h" #include "utils/CPUInfo.h" #define PLANE_Y 0 diff --git a/xbmc/platform/win32/utils/CMakeLists.txt b/xbmc/platform/win32/utils/CMakeLists.txt new file mode 100644 index 0000000000..3c32cb4986 --- /dev/null +++ b/xbmc/platform/win32/utils/CMakeLists.txt @@ -0,0 +1,7 @@ +set(SOURCES Win32InterfaceForCLog.cpp) + +set(HEADERS 
gpu_memcpy_sse4.h + memcpy_sse2.h + Win32InterfaceForCLog.h) + +core_add_library(platform_win32_utils) diff --git a/xbmc/platform/win32/utils/Win32InterfaceForCLog.cpp b/xbmc/platform/win32/utils/Win32InterfaceForCLog.cpp new file mode 100644 index 0000000000..0a5cf9a43f --- /dev/null +++ b/xbmc/platform/win32/utils/Win32InterfaceForCLog.cpp @@ -0,0 +1,128 @@ +/* +* Copyright (C) 2014 Team XBMC +* http://kodi.tv +* +* This Program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2, or (at your option) +* any later version. +* +* This Program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with XBMC; see the file COPYING. If not, see +* <http://www.gnu.org/licenses/>. 
+* +*/ + +#ifndef TARGET_WINDOWS +#error This file is for win32 platforms only +#endif //!TARGET_WINDOWS + +#include "Win32InterfaceForCLog.h" +#include "platform/win32/WIN32Util.h" +#include "utils/StringUtils.h" +#include "utils/auto_buffer.h" + +#include + +CWin32InterfaceForCLog::CWin32InterfaceForCLog() : + m_hFile(INVALID_HANDLE_VALUE) +{ } + +CWin32InterfaceForCLog::~CWin32InterfaceForCLog() +{ + if (m_hFile != INVALID_HANDLE_VALUE) + CloseHandle(m_hFile); +} + +bool CWin32InterfaceForCLog::OpenLogFile(const std::string& logFilename, const std::string& backupOldLogToFilename) +{ + if (m_hFile != INVALID_HANDLE_VALUE) + return false; // file was already opened + + std::wstring strLogFileW(CWIN32Util::ConvertPathToWin32Form(CWIN32Util::SmbToUnc(logFilename))); + std::wstring strLogFileOldW(CWIN32Util::ConvertPathToWin32Form(CWIN32Util::SmbToUnc(backupOldLogToFilename))); + + if (strLogFileW.empty()) + return false; + + if (!strLogFileOldW.empty()) + { + (void)DeleteFileW(strLogFileOldW.c_str()); // if it's failed, try to continue +#ifdef TARGET_WINDOWS_STORE + (void)MoveFileEx(strLogFileW.c_str(), strLogFileOldW.c_str(), MOVEFILE_REPLACE_EXISTING); // if it's failed, try to continue +#else + (void)MoveFileW(strLogFileW.c_str(), strLogFileOldW.c_str()); // if it's failed, try to continue +#endif + } + +#ifdef TARGET_WINDOWS_STORE + m_hFile = CreateFile2(strLogFileW.c_str(), GENERIC_WRITE, FILE_SHARE_READ, + CREATE_ALWAYS, NULL); +#else + m_hFile = CreateFileW(strLogFileW.c_str(), GENERIC_WRITE, FILE_SHARE_READ, NULL, + CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); +#endif + + if (m_hFile == INVALID_HANDLE_VALUE) + return false; + + static const unsigned char BOM[3] = { 0xEF, 0xBB, 0xBF }; + DWORD written; + (void)WriteFile(m_hFile, BOM, sizeof(BOM), &written, NULL); // write BOM, ignore possible errors + (void)FlushFileBuffers(m_hFile); + + return true; +} + +void CWin32InterfaceForCLog::CloseLogFile(void) +{ + if (m_hFile != INVALID_HANDLE_VALUE) + { + 
CloseHandle(m_hFile); + m_hFile = INVALID_HANDLE_VALUE; + } +} + +bool CWin32InterfaceForCLog::WriteStringToLog(const std::string& logString) +{ + if (m_hFile == INVALID_HANDLE_VALUE) + return false; + + std::string strData(logString); + StringUtils::Replace(strData, "\n", "\r\n"); + strData += "\r\n"; + + DWORD written; + const bool ret = (WriteFile(m_hFile, strData.c_str(), strData.length(), &written, NULL) != 0) && written == strData.length(); + + return ret; +} + +void CWin32InterfaceForCLog::PrintDebugString(const std::string& debugString) +{ +#ifdef _DEBUG + ::OutputDebugStringW(L"Debug Print: "); + int bufSize = MultiByteToWideChar(CP_UTF8, 0, debugString.c_str(), debugString.length(), NULL, 0); + XUTILS::auto_buffer buf(sizeof(wchar_t) * (bufSize + 1)); // '+1' for extra safety + if (MultiByteToWideChar(CP_UTF8, 0, debugString.c_str(), debugString.length(), (wchar_t*)buf.get(), buf.size() / sizeof(wchar_t)) == bufSize) + ::OutputDebugStringW(std::wstring((wchar_t*)buf.get(), bufSize).c_str()); + else + ::OutputDebugStringA(debugString.c_str()); + ::OutputDebugStringW(L"\n"); +#endif // _DEBUG +} + +void CWin32InterfaceForCLog::GetCurrentLocalTime(int& hour, int& minute, int& second, double& millisecond) +{ + SYSTEMTIME time; + GetLocalTime(&time); + hour = time.wHour; + minute = time.wMinute; + second = time.wSecond; + millisecond = static_cast<double>(time.wMilliseconds); +} diff --git a/xbmc/platform/win32/utils/Win32InterfaceForCLog.h b/xbmc/platform/win32/utils/Win32InterfaceForCLog.h new file mode 100644 index 0000000000..0397c154df --- /dev/null +++ b/xbmc/platform/win32/utils/Win32InterfaceForCLog.h @@ -0,0 +1,38 @@ +#pragma once +/* +* Copyright (C) 2014 Team XBMC +* http://kodi.tv +* +* This Program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2, or (at your option) +* any later version. 
+* +* This Program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with XBMC; see the file COPYING. If not, see +* <http://www.gnu.org/licenses/>. +* +*/ + +#include <string> + +typedef void* HANDLE; // forward declaration, to avoid inclusion of whole Windows.h + +class CWin32InterfaceForCLog +{ +public: + CWin32InterfaceForCLog(); + ~CWin32InterfaceForCLog(); + bool OpenLogFile(const std::string& logFilename, const std::string& backupOldLogToFilename); + void CloseLogFile(void); + bool WriteStringToLog(const std::string& logString); + void PrintDebugString(const std::string& debugString); + static void GetCurrentLocalTime(int& hour, int& minute, int& second, double& millisecond); +private: + HANDLE m_hFile; +}; diff --git a/xbmc/platform/win32/utils/gpu_memcpy_sse4.h b/xbmc/platform/win32/utils/gpu_memcpy_sse4.h new file mode 100644 index 0000000000..b5eb89896b --- /dev/null +++ b/xbmc/platform/win32/utils/gpu_memcpy_sse4.h @@ -0,0 +1,129 @@ +/* + * Copyright (C) 2011-2015 Hendrik Leppkes + * http://www.1f0.de + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Taken from the QuickSync decoder by Eric Gur + */ + +#include +#include + +// gpu_memcpy is a memcpy style function that copied data very fast from a +// GPU tiled memory (write back) +// Performance tip: page offset (12 lsb) of both addresses should be different +// optimally use a 2K offset between them. +inline void* gpu_memcpy(void* d, const void* s, size_t size) +{ + static const size_t regsInLoop = sizeof(size_t) * 2; // 8 or 16 + + if (d == nullptr || s == nullptr) return nullptr; + + // If memory is not aligned, use memcpy + bool isAligned = (((size_t)(s) | (size_t)(d)) & 0xF) == 0; + if (!isAligned) + { + return memcpy(d, s, size); + } + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; +#ifdef _M_X64 + __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; +#endif + + size_t reminder = size & (regsInLoop * sizeof(xmm0) - 1); // Copy 128 or 256 bytes every loop + size_t end = 0; + + __m128i* pTrg = (__m128i*)d; + __m128i* pTrgEnd = pTrg + ((size - reminder) >> 4); + __m128i* pSrc = (__m128i*)s; + + // Make sure source is synced - doesn't hurt if not needed. + _mm_sfence(); + + while (pTrg < pTrgEnd) + { + // _mm_stream_load_si128 emits the Streaming SIMD Extensions 4 (SSE4.1) instruction MOVNTDQA + // Fastest method for copying GPU RAM. 
Available since Penryn (45nm Core 2 Duo/Quad) + xmm0 = _mm_stream_load_si128(pSrc); + xmm1 = _mm_stream_load_si128(pSrc + 1); + xmm2 = _mm_stream_load_si128(pSrc + 2); + xmm3 = _mm_stream_load_si128(pSrc + 3); + xmm4 = _mm_stream_load_si128(pSrc + 4); + xmm5 = _mm_stream_load_si128(pSrc + 5); + xmm6 = _mm_stream_load_si128(pSrc + 6); + xmm7 = _mm_stream_load_si128(pSrc + 7); +#ifdef _M_X64 // Use all 16 xmm registers + xmm8 = _mm_stream_load_si128(pSrc + 8); + xmm9 = _mm_stream_load_si128(pSrc + 9); + xmm10 = _mm_stream_load_si128(pSrc + 10); + xmm11 = _mm_stream_load_si128(pSrc + 11); + xmm12 = _mm_stream_load_si128(pSrc + 12); + xmm13 = _mm_stream_load_si128(pSrc + 13); + xmm14 = _mm_stream_load_si128(pSrc + 14); + xmm15 = _mm_stream_load_si128(pSrc + 15); +#endif + pSrc += regsInLoop; + // _mm_store_si128 emit the SSE2 instruction MOVDQA (aligned store) + _mm_store_si128(pTrg , xmm0); + _mm_store_si128(pTrg + 1, xmm1); + _mm_store_si128(pTrg + 2, xmm2); + _mm_store_si128(pTrg + 3, xmm3); + _mm_store_si128(pTrg + 4, xmm4); + _mm_store_si128(pTrg + 5, xmm5); + _mm_store_si128(pTrg + 6, xmm6); + _mm_store_si128(pTrg + 7, xmm7); +#ifdef _M_X64 // Use all 16 xmm registers + _mm_store_si128(pTrg + 8, xmm8); + _mm_store_si128(pTrg + 9, xmm9); + _mm_store_si128(pTrg + 10, xmm10); + _mm_store_si128(pTrg + 11, xmm11); + _mm_store_si128(pTrg + 12, xmm12); + _mm_store_si128(pTrg + 13, xmm13); + _mm_store_si128(pTrg + 14, xmm14); + _mm_store_si128(pTrg + 15, xmm15); +#endif + pTrg += regsInLoop; + } + + // Copy in 16 byte steps + if (reminder >= 16) + { + size = reminder; + reminder = size & 15; + end = size >> 4; + for (size_t i = 0; i < end; ++i) + { + pTrg[i] = _mm_stream_load_si128(pSrc + i); + } + } + + // Copy last bytes - shouldn't happen as strides are modulo 16 + if (reminder) + { + __m128i temp = _mm_stream_load_si128(pSrc + end); + + char* ps = (char*)(&temp); + char* pt = (char*)(pTrg + end); + + for (size_t i = 0; i < reminder; ++i) + { + pt[i] = ps[i]; + } + } 
+ + return d; +} \ No newline at end of file diff --git a/xbmc/platform/win32/utils/memcpy_sse2.h b/xbmc/platform/win32/utils/memcpy_sse2.h new file mode 100644 index 0000000000..13ee1ac5b0 --- /dev/null +++ b/xbmc/platform/win32/utils/memcpy_sse2.h @@ -0,0 +1,333 @@ +/* + * Copyright (C) 2005-2015 Team Kodi + * http://kodi.tv + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#pragma once +#if defined(HAVE_SSE2) +#include +#endif + +inline void* memcpy_aligned(void* dst, const void* src, size_t size, uint8_t bpp = 0) +{ + const uint8_t shift = 16 - bpp; +#if defined(HAVE_SSE2) + __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; +#ifdef _M_X64 + __m128i xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm16; +#endif +#endif + +#if defined(HAVE_SSE2) + // if memory is not aligned, use memcpy + if ((((size_t)(src) | (size_t)(dst)) & 0xF)) +#endif + { + if (bpp == 0 || bpp == 16) + return memcpy(dst, src, size); + else + { + uint16_t * y = (uint16_t*)(src); + uint16_t * d = (uint16_t*)(dst); + for (size_t x = 0; x < (size >> 1); x++) + { + d[x] = y[x] << shift; + } + return dst; + } + } +#if defined(HAVE_SSE2) + static const size_t regsInLoop = sizeof(size_t) * 2; // 8 or 16 + size_t reminder = size & (regsInLoop * sizeof(xmm1) - 1); // Copy 128 or 256 bytes 
every loop + size_t end = 0; + + __m128i* pTrg = (__m128i*)dst; + __m128i* pTrgEnd = pTrg + ((size - reminder) >> 4); + __m128i* pSrc = (__m128i*)src; + + _mm_sfence(); + + while(pTrg < pTrgEnd) + //for (i = 0; i < size - 63; i += 64) + { + xmm1 = _mm_load_si128(pSrc); + xmm2 = _mm_load_si128(pSrc + 1); + xmm3 = _mm_load_si128(pSrc + 2); + xmm4 = _mm_load_si128(pSrc + 3); + xmm5 = _mm_load_si128(pSrc + 4); + xmm6 = _mm_load_si128(pSrc + 5); + xmm7 = _mm_load_si128(pSrc + 6); + xmm8 = _mm_load_si128(pSrc + 7); +#ifdef _M_X64 // Use all 16 xmm registers + xmm9 = _mm_load_si128(pSrc + 8); + xmm10 = _mm_load_si128(pSrc + 9); + xmm11 = _mm_load_si128(pSrc + 10); + xmm12 = _mm_load_si128(pSrc + 11); + xmm13 = _mm_load_si128(pSrc + 12); + xmm14 = _mm_load_si128(pSrc + 13); + xmm15 = _mm_load_si128(pSrc + 14); + xmm16 = _mm_load_si128(pSrc + 15); +#endif + pSrc += regsInLoop; + + if (bpp != 0 && bpp != 16) + { + xmm1 = _mm_slli_epi16(xmm1, shift); + xmm2 = _mm_slli_epi16(xmm2, shift); + xmm3 = _mm_slli_epi16(xmm3, shift); + xmm4 = _mm_slli_epi16(xmm4, shift); + xmm5 = _mm_slli_epi16(xmm5, shift); + xmm6 = _mm_slli_epi16(xmm6, shift); + xmm7 = _mm_slli_epi16(xmm7, shift); + xmm8 = _mm_slli_epi16(xmm8, shift); +#ifdef _M_X64 // Use all 16 xmm registers + xmm9 = _mm_slli_epi16(xmm9, shift); + xmm10 = _mm_slli_epi16(xmm10, shift); + xmm11 = _mm_slli_epi16(xmm11, shift); + xmm12 = _mm_slli_epi16(xmm12, shift); + xmm13 = _mm_slli_epi16(xmm13, shift); + xmm14 = _mm_slli_epi16(xmm14, shift); + xmm15 = _mm_slli_epi16(xmm15, shift); + xmm16 = _mm_slli_epi16(xmm16, shift); +#endif + } + + _mm_stream_si128(pTrg, xmm1); + _mm_stream_si128(pTrg + 1, xmm2); + _mm_stream_si128(pTrg + 2, xmm3); + _mm_stream_si128(pTrg + 3, xmm4); + _mm_stream_si128(pTrg + 4, xmm5); + _mm_stream_si128(pTrg + 5, xmm6); + _mm_stream_si128(pTrg + 6, xmm7); + _mm_stream_si128(pTrg + 7, xmm8); +#ifdef _M_X64 // Use all 16 xmm registers + _mm_stream_si128(pTrg + 8, xmm9); + _mm_stream_si128(pTrg + 9, xmm10); + 
_mm_stream_si128(pTrg + 10, xmm11); + _mm_stream_si128(pTrg + 11, xmm12); + _mm_stream_si128(pTrg + 12, xmm13); + _mm_stream_si128(pTrg + 13, xmm14); + _mm_stream_si128(pTrg + 14, xmm15); + _mm_stream_si128(pTrg + 15, xmm16); +#endif + pTrg += regsInLoop; + } + + if (reminder >= 16) + { + size = reminder; + reminder = size & 15; + end = size >> 4; + for (size_t i = 0; i < end; ++i) + { + xmm1 = _mm_load_si128(pSrc + i); + if (bpp != 0 && bpp != 16) + xmm1 = _mm_slli_epi16(xmm1, shift); + _mm_store_si128(pTrg + i, xmm1); + } + } + + if (reminder) + { + __m128i temp = _mm_load_si128(pSrc + end); + char* ps = (char*)(&temp); + char* pt = (char*)(pTrg + end); + for (size_t i = 0; i < reminder; ++i) + { + pt[i] = ps[i] << shift; + } + } + return dst; +#endif +} + +inline void copy_plane(uint8_t *const src, const int srcStride, int height, int width, uint8_t *const dst, const int dstStride, uint8_t bpp = 0) +{ +#if defined(HAVE_SSE2) + _mm_sfence(); +#endif + + if (srcStride == dstStride) + memcpy_aligned(dst, src, srcStride * height, bpp); + else + { + for (size_t line = 0; line < height; ++line) + { + uint8_t * s = src + srcStride * line; + uint8_t * d = dst + dstStride * line; + memcpy_aligned(d, s, srcStride, bpp); + } + } +} + +inline void convert_yuv420_nv12_chrome(uint8_t *const *src, const int *srcStride, int height, int width, uint8_t *const dst, const int dstStride) +{ +#if defined(HAVE_SSE2) + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + _mm_sfence(); +#endif + + const size_t chroma_width = (width + 1) >> 1; + const size_t chromaHeight = height >> 1; + size_t line, i; + + for (line = 0; line < chromaHeight; ++line) + { + uint8_t * u = src[0] + line * srcStride[0]; + uint8_t * v = src[1] + line * srcStride[1]; + uint8_t * d = dst + line * dstStride; + + // if memory is not aligned use memcpy +#if defined(HAVE_SSE2) + if (((size_t)(u) | (size_t)(v) | (size_t)(d)) & 0xF) +#endif + { + for (i = 0; i < chroma_width; ++i) + { + *d++ = *u++; + *d++ = *v++; + } + } +#if 
defined(HAVE_SSE2) + else + { + for (i = 0; i < (chroma_width - 31); i += 32) + { + xmm0 = _mm_load_si128((__m128i*)(v + i)); + xmm1 = _mm_load_si128((__m128i*)(u + i)); + xmm2 = _mm_load_si128((__m128i*)(v + i + 16)); + xmm3 = _mm_load_si128((__m128i*)(u + i + 16)); + + xmm4 = xmm0; + xmm0 = _mm_unpacklo_epi8(xmm1, xmm0); + xmm4 = _mm_unpackhi_epi8(xmm1, xmm4); + + xmm1 = xmm2; + xmm2 = _mm_unpacklo_epi8(xmm3, xmm2); + xmm1 = _mm_unpackhi_epi8(xmm3, xmm1); + + _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0); + _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm4); + _mm_stream_si128((__m128i *)(d + (i << 1) + 32), xmm2); + _mm_stream_si128((__m128i *)(d + (i << 1) + 48), xmm1); + } + if (((size_t)chroma_width) & 0xF) + { + d += (i << 1); + u += i; v += i; + for (; i < chroma_width; ++i) + { + *d++ = *u++; + *d++ = *v++; + } + } + else if (i < chroma_width) + { + xmm0 = _mm_load_si128((__m128i*)(v + i)); + xmm1 = _mm_load_si128((__m128i*)(u + i)); + + xmm2 = xmm0; + xmm0 = _mm_unpacklo_epi8(xmm1, xmm0); + xmm2 = _mm_unpackhi_epi8(xmm1, xmm2); + + _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0); + _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm2); + } + } +#endif + } +} + +inline void convert_yuv420_p01x_chrome(uint8_t *const *src, const int *srcStride, int height, int width, uint8_t *const dst, const int dstStride, uint8_t bpp) +{ + const uint8_t shift = 16 - bpp; +#if defined(HAVE_SSE2) + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + _mm_sfence(); +#endif + + // Convert to P01x - Chroma + const size_t chromaWidth = (width + 1) >> 1; + const size_t chromaHeight = height >> 1; + size_t line, i; + + for (line = 0; line < chromaHeight; ++line) + { + uint16_t * u = (uint16_t*)(src[0] + line * srcStride[0]); + uint16_t * v = (uint16_t*)(src[1] + line * srcStride[1]); + uint16_t * d = (uint16_t*)(dst + line * dstStride); + + // if memory is not aligned use memcpy +#if defined(HAVE_SSE2) + if (((size_t)(u) | (size_t)(v) | (size_t)(d)) & 0xF) +#endif + { + for 
(i = 0; i < chromaWidth; ++i) + { + *d++ = *u++ << shift; + *d++ = *v++ << shift; + } + } +#if defined(HAVE_SSE2) + else + { + for (i = 0; i < chromaWidth; i += 16) + { + xmm0 = _mm_load_si128((__m128i*)(v + i)); + xmm1 = _mm_load_si128((__m128i*)(u + i)); + xmm2 = _mm_load_si128((__m128i*)(v + i + 8)); + xmm3 = _mm_load_si128((__m128i*)(u + i + 8)); + + xmm0 = _mm_slli_epi16(xmm0, shift); + xmm1 = _mm_slli_epi16(xmm1, shift); + xmm2 = _mm_slli_epi16(xmm2, shift); + xmm3 = _mm_slli_epi16(xmm3, shift); + + xmm4 = xmm0; + xmm0 = _mm_unpacklo_epi16(xmm1, xmm0); + xmm4 = _mm_unpackhi_epi16(xmm1, xmm4); + + xmm1 = xmm2; + xmm2 = _mm_unpacklo_epi16(xmm3, xmm2); + xmm1 = _mm_unpackhi_epi16(xmm3, xmm1); + + _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0); + _mm_stream_si128((__m128i *)(d + (i << 1) + 8), xmm4); + _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm2); + _mm_stream_si128((__m128i *)(d + (i << 1) + 24), xmm1); + } + } +#endif + } +} + +inline void convert_yuv420_nv12(uint8_t *const src[], const int srcStride[], int height, int width, uint8_t *const dst[], const int dstStride[]) +{ + // Convert to NV12 - Luma + copy_plane(src[0], srcStride[0], height, width, dst[0], dstStride[0]); + // Convert to NV12 - Chroma + convert_yuv420_nv12_chrome(&src[1], &srcStride[1], height, width, dst[1], dstStride[1]); +} + +inline void convert_yuv420_p01x(uint8_t *const src[], const int srcStride[], int height, int width, uint8_t *const dst[], const int dstStride[], uint8_t bpp) +{ + // Convert to P01x - Luma + copy_plane(src[0], srcStride[0], height, width, dst[0], dstStride[0], bpp); + // Convert to P01x - Chroma + convert_yuv420_p01x_chrome(&src[1], &srcStride[1], height, width, dst[1], dstStride[1], bpp); +} diff --git a/xbmc/utils/log.cpp b/xbmc/utils/log.cpp index 3223f88826..e38037a81d 100644 --- a/xbmc/utils/log.cpp +++ b/xbmc/utils/log.cpp @@ -30,7 +30,7 @@ #include "platform/posix/utils/PosixInterfaceForCLog.h" typedef class CPosixInterfaceForCLog 
PlatformInterfaceForCLog; #elif defined(TARGET_WINDOWS) -#include "win32/Win32InterfaceForCLog.h" +#include "platform/win32/utils/Win32InterfaceForCLog.h" typedef class CWin32InterfaceForCLog PlatformInterfaceForCLog; #endif diff --git a/xbmc/utils/win32/CMakeLists.txt b/xbmc/utils/win32/CMakeLists.txt deleted file mode 100644 index 3f71e0bfc2..0000000000 --- a/xbmc/utils/win32/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -set(SOURCES Win32InterfaceForCLog.cpp) - -set(HEADERS gpu_memcpy_sse4.h - memcpy_sse2.h - Win32InterfaceForCLog.h) - -core_add_library(utils_win32) diff --git a/xbmc/utils/win32/Win32InterfaceForCLog.cpp b/xbmc/utils/win32/Win32InterfaceForCLog.cpp deleted file mode 100644 index 0a5cf9a43f..0000000000 --- a/xbmc/utils/win32/Win32InterfaceForCLog.cpp +++ /dev/null @@ -1,128 +0,0 @@ -/* -* Copyright (C) 2014 Team XBMC -* http://kodi.tv -* -* This Program is free software; you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation; either version 2, or (at your option) -* any later version. -* -* This Program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. -* -* You should have received a copy of the GNU General Public License -* along with XBMC; see the file COPYING. If not, see -* <http://www.gnu.org/licenses/>. 
-* -*/ - -#ifndef TARGET_WINDOWS -#error This file is for win32 platforms only -#endif //!TARGET_WINDOWS - -#include "Win32InterfaceForCLog.h" -#include "platform/win32/WIN32Util.h" -#include "utils/StringUtils.h" -#include "utils/auto_buffer.h" - -#include - -CWin32InterfaceForCLog::CWin32InterfaceForCLog() : - m_hFile(INVALID_HANDLE_VALUE) -{ } - -CWin32InterfaceForCLog::~CWin32InterfaceForCLog() -{ - if (m_hFile != INVALID_HANDLE_VALUE) - CloseHandle(m_hFile); -} - -bool CWin32InterfaceForCLog::OpenLogFile(const std::string& logFilename, const std::string& backupOldLogToFilename) -{ - if (m_hFile != INVALID_HANDLE_VALUE) - return false; // file was already opened - - std::wstring strLogFileW(CWIN32Util::ConvertPathToWin32Form(CWIN32Util::SmbToUnc(logFilename))); - std::wstring strLogFileOldW(CWIN32Util::ConvertPathToWin32Form(CWIN32Util::SmbToUnc(backupOldLogToFilename))); - - if (strLogFileW.empty()) - return false; - - if (!strLogFileOldW.empty()) - { - (void)DeleteFileW(strLogFileOldW.c_str()); // if it's failed, try to continue -#ifdef TARGET_WINDOWS_STORE - (void)MoveFileEx(strLogFileW.c_str(), strLogFileOldW.c_str(), MOVEFILE_REPLACE_EXISTING); // if it's failed, try to continue -#else - (void)MoveFileW(strLogFileW.c_str(), strLogFileOldW.c_str()); // if it's failed, try to continue -#endif - } - -#ifdef TARGET_WINDOWS_STORE - m_hFile = CreateFile2(strLogFileW.c_str(), GENERIC_WRITE, FILE_SHARE_READ, - CREATE_ALWAYS, NULL); -#else - m_hFile = CreateFileW(strLogFileW.c_str(), GENERIC_WRITE, FILE_SHARE_READ, NULL, - CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); -#endif - - if (m_hFile == INVALID_HANDLE_VALUE) - return false; - - static const unsigned char BOM[3] = { 0xEF, 0xBB, 0xBF }; - DWORD written; - (void)WriteFile(m_hFile, BOM, sizeof(BOM), &written, NULL); // write BOM, ignore possible errors - (void)FlushFileBuffers(m_hFile); - - return true; -} - -void CWin32InterfaceForCLog::CloseLogFile(void) -{ - if (m_hFile != INVALID_HANDLE_VALUE) - { - 
CloseHandle(m_hFile); - m_hFile = INVALID_HANDLE_VALUE; - } -} - -bool CWin32InterfaceForCLog::WriteStringToLog(const std::string& logString) -{ - if (m_hFile == INVALID_HANDLE_VALUE) - return false; - - std::string strData(logString); - StringUtils::Replace(strData, "\n", "\r\n"); - strData += "\r\n"; - - DWORD written; - const bool ret = (WriteFile(m_hFile, strData.c_str(), strData.length(), &written, NULL) != 0) && written == strData.length(); - - return ret; -} - -void CWin32InterfaceForCLog::PrintDebugString(const std::string& debugString) -{ -#ifdef _DEBUG - ::OutputDebugStringW(L"Debug Print: "); - int bufSize = MultiByteToWideChar(CP_UTF8, 0, debugString.c_str(), debugString.length(), NULL, 0); - XUTILS::auto_buffer buf(sizeof(wchar_t) * (bufSize + 1)); // '+1' for extra safety - if (MultiByteToWideChar(CP_UTF8, 0, debugString.c_str(), debugString.length(), (wchar_t*)buf.get(), buf.size() / sizeof(wchar_t)) == bufSize) - ::OutputDebugStringW(std::wstring((wchar_t*)buf.get(), bufSize).c_str()); - else - ::OutputDebugStringA(debugString.c_str()); - ::OutputDebugStringW(L"\n"); -#endif // _DEBUG -} - -void CWin32InterfaceForCLog::GetCurrentLocalTime(int& hour, int& minute, int& second, double& millisecond) -{ - SYSTEMTIME time; - GetLocalTime(&time); - hour = time.wHour; - minute = time.wMinute; - second = time.wSecond; - millisecond = static_cast<double>(time.wMilliseconds); -} diff --git a/xbmc/utils/win32/Win32InterfaceForCLog.h b/xbmc/utils/win32/Win32InterfaceForCLog.h deleted file mode 100644 index 0397c154df..0000000000 --- a/xbmc/utils/win32/Win32InterfaceForCLog.h +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once -/* -* Copyright (C) 2014 Team XBMC -* http://kodi.tv -* -* This Program is free software; you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation; either version 2, or (at your option) -* any later version. 
-* -* This Program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. -* -* You should have received a copy of the GNU General Public License -* along with XBMC; see the file COPYING. If not, see -* <http://www.gnu.org/licenses/>. -* -*/ - -#include <string> - -typedef void* HANDLE; // forward declaration, to avoid inclusion of whole Windows.h - -class CWin32InterfaceForCLog -{ -public: - CWin32InterfaceForCLog(); - ~CWin32InterfaceForCLog(); - bool OpenLogFile(const std::string& logFilename, const std::string& backupOldLogToFilename); - void CloseLogFile(void); - bool WriteStringToLog(const std::string& logString); - void PrintDebugString(const std::string& debugString); - static void GetCurrentLocalTime(int& hour, int& minute, int& second, double& millisecond); -private: - HANDLE m_hFile; -}; diff --git a/xbmc/utils/win32/gpu_memcpy_sse4.h b/xbmc/utils/win32/gpu_memcpy_sse4.h deleted file mode 100644 index b5eb89896b..0000000000 --- a/xbmc/utils/win32/gpu_memcpy_sse4.h +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (C) 2011-2015 Hendrik Leppkes - * http://www.1f0.de - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- * - * Taken from the QuickSync decoder by Eric Gur - */ - -#include -#include - -// gpu_memcpy is a memcpy style function that copied data very fast from a -// GPU tiled memory (write back) -// Performance tip: page offset (12 lsb) of both addresses should be different -// optimally use a 2K offset between them. -inline void* gpu_memcpy(void* d, const void* s, size_t size) -{ - static const size_t regsInLoop = sizeof(size_t) * 2; // 8 or 16 - - if (d == nullptr || s == nullptr) return nullptr; - - // If memory is not aligned, use memcpy - bool isAligned = (((size_t)(s) | (size_t)(d)) & 0xF) == 0; - if (!isAligned) - { - return memcpy(d, s, size); - } - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; -#ifdef _M_X64 - __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; -#endif - - size_t reminder = size & (regsInLoop * sizeof(xmm0) - 1); // Copy 128 or 256 bytes every loop - size_t end = 0; - - __m128i* pTrg = (__m128i*)d; - __m128i* pTrgEnd = pTrg + ((size - reminder) >> 4); - __m128i* pSrc = (__m128i*)s; - - // Make sure source is synced - doesn't hurt if not needed. - _mm_sfence(); - - while (pTrg < pTrgEnd) - { - // _mm_stream_load_si128 emits the Streaming SIMD Extensions 4 (SSE4.1) instruction MOVNTDQA - // Fastest method for copying GPU RAM. 
Available since Penryn (45nm Core 2 Duo/Quad) - xmm0 = _mm_stream_load_si128(pSrc); - xmm1 = _mm_stream_load_si128(pSrc + 1); - xmm2 = _mm_stream_load_si128(pSrc + 2); - xmm3 = _mm_stream_load_si128(pSrc + 3); - xmm4 = _mm_stream_load_si128(pSrc + 4); - xmm5 = _mm_stream_load_si128(pSrc + 5); - xmm6 = _mm_stream_load_si128(pSrc + 6); - xmm7 = _mm_stream_load_si128(pSrc + 7); -#ifdef _M_X64 // Use all 16 xmm registers - xmm8 = _mm_stream_load_si128(pSrc + 8); - xmm9 = _mm_stream_load_si128(pSrc + 9); - xmm10 = _mm_stream_load_si128(pSrc + 10); - xmm11 = _mm_stream_load_si128(pSrc + 11); - xmm12 = _mm_stream_load_si128(pSrc + 12); - xmm13 = _mm_stream_load_si128(pSrc + 13); - xmm14 = _mm_stream_load_si128(pSrc + 14); - xmm15 = _mm_stream_load_si128(pSrc + 15); -#endif - pSrc += regsInLoop; - // _mm_store_si128 emit the SSE2 instruction MOVDQA (aligned store) - _mm_store_si128(pTrg , xmm0); - _mm_store_si128(pTrg + 1, xmm1); - _mm_store_si128(pTrg + 2, xmm2); - _mm_store_si128(pTrg + 3, xmm3); - _mm_store_si128(pTrg + 4, xmm4); - _mm_store_si128(pTrg + 5, xmm5); - _mm_store_si128(pTrg + 6, xmm6); - _mm_store_si128(pTrg + 7, xmm7); -#ifdef _M_X64 // Use all 16 xmm registers - _mm_store_si128(pTrg + 8, xmm8); - _mm_store_si128(pTrg + 9, xmm9); - _mm_store_si128(pTrg + 10, xmm10); - _mm_store_si128(pTrg + 11, xmm11); - _mm_store_si128(pTrg + 12, xmm12); - _mm_store_si128(pTrg + 13, xmm13); - _mm_store_si128(pTrg + 14, xmm14); - _mm_store_si128(pTrg + 15, xmm15); -#endif - pTrg += regsInLoop; - } - - // Copy in 16 byte steps - if (reminder >= 16) - { - size = reminder; - reminder = size & 15; - end = size >> 4; - for (size_t i = 0; i < end; ++i) - { - pTrg[i] = _mm_stream_load_si128(pSrc + i); - } - } - - // Copy last bytes - shouldn't happen as strides are modulo 16 - if (reminder) - { - __m128i temp = _mm_stream_load_si128(pSrc + end); - - char* ps = (char*)(&temp); - char* pt = (char*)(pTrg + end); - - for (size_t i = 0; i < reminder; ++i) - { - pt[i] = ps[i]; - } - } 
- - return d; -} \ No newline at end of file diff --git a/xbmc/utils/win32/memcpy_sse2.h b/xbmc/utils/win32/memcpy_sse2.h deleted file mode 100644 index 13ee1ac5b0..0000000000 --- a/xbmc/utils/win32/memcpy_sse2.h +++ /dev/null @@ -1,333 +0,0 @@ -/* - * Copyright (C) 2005-2015 Team Kodi - * http://kodi.tv - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -#pragma once -#if defined(HAVE_SSE2) -#include -#endif - -inline void* memcpy_aligned(void* dst, const void* src, size_t size, uint8_t bpp = 0) -{ - const uint8_t shift = 16 - bpp; -#if defined(HAVE_SSE2) - __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; -#ifdef _M_X64 - __m128i xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm16; -#endif -#endif - -#if defined(HAVE_SSE2) - // if memory is not aligned, use memcpy - if ((((size_t)(src) | (size_t)(dst)) & 0xF)) -#endif - { - if (bpp == 0 || bpp == 16) - return memcpy(dst, src, size); - else - { - uint16_t * y = (uint16_t*)(src); - uint16_t * d = (uint16_t*)(dst); - for (size_t x = 0; x < (size >> 1); x++) - { - d[x] = y[x] << shift; - } - return dst; - } - } -#if defined(HAVE_SSE2) - static const size_t regsInLoop = sizeof(size_t) * 2; // 8 or 16 - size_t reminder = size & (regsInLoop * sizeof(xmm1) - 1); // Copy 128 or 256 bytes every loop - size_t end = 
0; - - __m128i* pTrg = (__m128i*)dst; - __m128i* pTrgEnd = pTrg + ((size - reminder) >> 4); - __m128i* pSrc = (__m128i*)src; - - _mm_sfence(); - - while(pTrg < pTrgEnd) - //for (i = 0; i < size - 63; i += 64) - { - xmm1 = _mm_load_si128(pSrc); - xmm2 = _mm_load_si128(pSrc + 1); - xmm3 = _mm_load_si128(pSrc + 2); - xmm4 = _mm_load_si128(pSrc + 3); - xmm5 = _mm_load_si128(pSrc + 4); - xmm6 = _mm_load_si128(pSrc + 5); - xmm7 = _mm_load_si128(pSrc + 6); - xmm8 = _mm_load_si128(pSrc + 7); -#ifdef _M_X64 // Use all 16 xmm registers - xmm9 = _mm_load_si128(pSrc + 8); - xmm10 = _mm_load_si128(pSrc + 9); - xmm11 = _mm_load_si128(pSrc + 10); - xmm12 = _mm_load_si128(pSrc + 11); - xmm13 = _mm_load_si128(pSrc + 12); - xmm14 = _mm_load_si128(pSrc + 13); - xmm15 = _mm_load_si128(pSrc + 14); - xmm16 = _mm_load_si128(pSrc + 15); -#endif - pSrc += regsInLoop; - - if (bpp != 0 && bpp != 16) - { - xmm1 = _mm_slli_epi16(xmm1, shift); - xmm2 = _mm_slli_epi16(xmm2, shift); - xmm3 = _mm_slli_epi16(xmm3, shift); - xmm4 = _mm_slli_epi16(xmm4, shift); - xmm5 = _mm_slli_epi16(xmm5, shift); - xmm6 = _mm_slli_epi16(xmm6, shift); - xmm7 = _mm_slli_epi16(xmm7, shift); - xmm8 = _mm_slli_epi16(xmm8, shift); -#ifdef _M_X64 // Use all 16 xmm registers - xmm9 = _mm_slli_epi16(xmm9, shift); - xmm10 = _mm_slli_epi16(xmm10, shift); - xmm11 = _mm_slli_epi16(xmm11, shift); - xmm12 = _mm_slli_epi16(xmm12, shift); - xmm13 = _mm_slli_epi16(xmm13, shift); - xmm14 = _mm_slli_epi16(xmm14, shift); - xmm15 = _mm_slli_epi16(xmm15, shift); - xmm16 = _mm_slli_epi16(xmm16, shift); -#endif - } - - _mm_stream_si128(pTrg, xmm1); - _mm_stream_si128(pTrg + 1, xmm2); - _mm_stream_si128(pTrg + 2, xmm3); - _mm_stream_si128(pTrg + 3, xmm4); - _mm_stream_si128(pTrg + 4, xmm5); - _mm_stream_si128(pTrg + 5, xmm6); - _mm_stream_si128(pTrg + 6, xmm7); - _mm_stream_si128(pTrg + 7, xmm8); -#ifdef _M_X64 // Use all 16 xmm registers - _mm_stream_si128(pTrg + 8, xmm9); - _mm_stream_si128(pTrg + 9, xmm10); - _mm_stream_si128(pTrg + 10, 
xmm11); - _mm_stream_si128(pTrg + 11, xmm12); - _mm_stream_si128(pTrg + 12, xmm13); - _mm_stream_si128(pTrg + 13, xmm14); - _mm_stream_si128(pTrg + 14, xmm15); - _mm_stream_si128(pTrg + 15, xmm16); -#endif - pTrg += regsInLoop; - } - - if (reminder >= 16) - { - size = reminder; - reminder = size & 15; - end = size >> 4; - for (size_t i = 0; i < end; ++i) - { - xmm1 = _mm_load_si128(pSrc + i); - if (bpp != 0 && bpp != 16) - xmm1 = _mm_slli_epi16(xmm1, shift); - _mm_store_si128(pTrg + i, xmm1); - } - } - - if (reminder) - { - __m128i temp = _mm_load_si128(pSrc + end); - char* ps = (char*)(&temp); - char* pt = (char*)(pTrg + end); - for (size_t i = 0; i < reminder; ++i) - { - pt[i] = ps[i] << shift; - } - } - return dst; -#endif -} - -inline void copy_plane(uint8_t *const src, const int srcStride, int height, int width, uint8_t *const dst, const int dstStride, uint8_t bpp = 0) -{ -#if defined(HAVE_SSE2) - _mm_sfence(); -#endif - - if (srcStride == dstStride) - memcpy_aligned(dst, src, srcStride * height, bpp); - else - { - for (size_t line = 0; line < height; ++line) - { - uint8_t * s = src + srcStride * line; - uint8_t * d = dst + dstStride * line; - memcpy_aligned(d, s, srcStride, bpp); - } - } -} - -inline void convert_yuv420_nv12_chrome(uint8_t *const *src, const int *srcStride, int height, int width, uint8_t *const dst, const int dstStride) -{ -#if defined(HAVE_SSE2) - __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - _mm_sfence(); -#endif - - const size_t chroma_width = (width + 1) >> 1; - const size_t chromaHeight = height >> 1; - size_t line, i; - - for (line = 0; line < chromaHeight; ++line) - { - uint8_t * u = src[0] + line * srcStride[0]; - uint8_t * v = src[1] + line * srcStride[1]; - uint8_t * d = dst + line * dstStride; - - // if memory is not aligned use memcpy -#if defined(HAVE_SSE2) - if (((size_t)(u) | (size_t)(v) | (size_t)(d)) & 0xF) -#endif - { - for (i = 0; i < chroma_width; ++i) - { - *d++ = *u++; - *d++ = *v++; - } - } -#if defined(HAVE_SSE2) - else - { 
- for (i = 0; i < (chroma_width - 31); i += 32) - { - xmm0 = _mm_load_si128((__m128i*)(v + i)); - xmm1 = _mm_load_si128((__m128i*)(u + i)); - xmm2 = _mm_load_si128((__m128i*)(v + i + 16)); - xmm3 = _mm_load_si128((__m128i*)(u + i + 16)); - - xmm4 = xmm0; - xmm0 = _mm_unpacklo_epi8(xmm1, xmm0); - xmm4 = _mm_unpackhi_epi8(xmm1, xmm4); - - xmm1 = xmm2; - xmm2 = _mm_unpacklo_epi8(xmm3, xmm2); - xmm1 = _mm_unpackhi_epi8(xmm3, xmm1); - - _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0); - _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm4); - _mm_stream_si128((__m128i *)(d + (i << 1) + 32), xmm2); - _mm_stream_si128((__m128i *)(d + (i << 1) + 48), xmm1); - } - if (((size_t)chroma_width) & 0xF) - { - d += (i << 1); - u += i; v += i; - for (; i < chroma_width; ++i) - { - *d++ = *u++; - *d++ = *v++; - } - } - else if (i < chroma_width) - { - xmm0 = _mm_load_si128((__m128i*)(v + i)); - xmm1 = _mm_load_si128((__m128i*)(u + i)); - - xmm2 = xmm0; - xmm0 = _mm_unpacklo_epi8(xmm1, xmm0); - xmm2 = _mm_unpackhi_epi8(xmm1, xmm2); - - _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0); - _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm2); - } - } -#endif - } -} - -inline void convert_yuv420_p01x_chrome(uint8_t *const *src, const int *srcStride, int height, int width, uint8_t *const dst, const int dstStride, uint8_t bpp) -{ - const uint8_t shift = 16 - bpp; -#if defined(HAVE_SSE2) - __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - _mm_sfence(); -#endif - - // Convert to P01x - Chroma - const size_t chromaWidth = (width + 1) >> 1; - const size_t chromaHeight = height >> 1; - size_t line, i; - - for (line = 0; line < chromaHeight; ++line) - { - uint16_t * u = (uint16_t*)(src[0] + line * srcStride[0]); - uint16_t * v = (uint16_t*)(src[1] + line * srcStride[1]); - uint16_t * d = (uint16_t*)(dst + line * dstStride); - - // if memory is not aligned use memcpy -#if defined(HAVE_SSE2) - if (((size_t)(u) | (size_t)(v) | (size_t)(d)) & 0xF) -#endif - { - for (i = 0; i < chromaWidth; ++i) 
- { - *d++ = *u++ << shift; - *d++ = *v++ << shift; - } - } -#if defined(HAVE_SSE2) - else - { - for (i = 0; i < chromaWidth; i += 16) - { - xmm0 = _mm_load_si128((__m128i*)(v + i)); - xmm1 = _mm_load_si128((__m128i*)(u + i)); - xmm2 = _mm_load_si128((__m128i*)(v + i + 8)); - xmm3 = _mm_load_si128((__m128i*)(u + i + 8)); - - xmm0 = _mm_slli_epi16(xmm0, shift); - xmm1 = _mm_slli_epi16(xmm1, shift); - xmm2 = _mm_slli_epi16(xmm2, shift); - xmm3 = _mm_slli_epi16(xmm3, shift); - - xmm4 = xmm0; - xmm0 = _mm_unpacklo_epi16(xmm1, xmm0); - xmm4 = _mm_unpackhi_epi16(xmm1, xmm4); - - xmm1 = xmm2; - xmm2 = _mm_unpacklo_epi16(xmm3, xmm2); - xmm1 = _mm_unpackhi_epi16(xmm3, xmm1); - - _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0); - _mm_stream_si128((__m128i *)(d + (i << 1) + 8), xmm4); - _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm2); - _mm_stream_si128((__m128i *)(d + (i << 1) + 24), xmm1); - } - } -#endif - } -} - -inline void convert_yuv420_nv12(uint8_t *const src[], const int srcStride[], int height, int width, uint8_t *const dst[], const int dstStride[]) -{ - // Convert to NV12 - Luma - copy_plane(src[0], srcStride[0], height, width, dst[0], dstStride[0]); - // Convert to NV12 - Chroma - convert_yuv420_nv12_chrome(&src[1], &srcStride[1], height, width, dst[1], dstStride[1]); -} - -inline void convert_yuv420_p01x(uint8_t *const src[], const int srcStride[], int height, int width, uint8_t *const dst[], const int dstStride[], uint8_t bpp) -{ - // Convert to P01x - Luma - copy_plane(src[0], srcStride[0], height, width, dst[0], dstStride[0], bpp); - // Convert to P01x - Chroma - convert_yuv420_p01x_chrome(&src[1], &srcStride[1], height, width, dst[1], dstStride[1], bpp); -} -- cgit v1.2.3