diff options
author | jmarshallnz <jcmarsha@gmail.com> | 2013-12-09 16:17:31 -0800 |
---|---|---|
committer | jmarshallnz <jcmarsha@gmail.com> | 2013-12-09 16:17:31 -0800 |
commit | e20eee92df9fe2c18c2bf70ba96dba0134d2f58d (patch) | |
tree | c273576d7efc1b70b393527f3b3fbcfd1518ac39 | |
parent | 79aee04c3aa8fb0dd4c05cfde84cd13318fed84a (diff) | |
parent | 7ad9c353a585510896a13fb061efd7ee0c71e607 (diff) |
Merge pull request #3650 from Karlson2k/fix_pcre_utf8
Fix PCRE UTF-8 error (mostly for scrapers)
-rw-r--r-- | xbmc/FileItem.cpp | 6 | ||||
-rw-r--r-- | xbmc/Util.cpp | 6 | ||||
-rw-r--r-- | xbmc/cores/ExternalPlayer/ExternalPlayer.cpp | 2 | ||||
-rw-r--r-- | xbmc/cores/playercorefactory/PlayerSelectionRule.cpp | 2 | ||||
-rw-r--r-- | xbmc/filesystem/StackDirectory.cpp | 2 | ||||
-rw-r--r-- | xbmc/utils/RegExp.cpp | 159 | ||||
-rw-r--r-- | xbmc/utils/RegExp.h | 21 | ||||
-rw-r--r-- | xbmc/utils/ScraperParser.cpp | 14 | ||||
-rw-r--r-- | xbmc/utils/StringUtils.cpp | 22 | ||||
-rw-r--r-- | xbmc/utils/StringUtils.h | 26 | ||||
-rw-r--r-- | xbmc/utils/XBMCTinyXML.cpp | 2 | ||||
-rw-r--r-- | xbmc/video/VideoInfoScanner.cpp | 4 |
12 files changed, 243 insertions, 23 deletions
diff --git a/xbmc/FileItem.cpp b/xbmc/FileItem.cpp index 6a806458ab..73a794983c 100644 --- a/xbmc/FileItem.cpp +++ b/xbmc/FileItem.cpp @@ -2324,7 +2324,7 @@ void CFileItemList::StackFolders() { // Precompile our REs VECCREGEXP folderRegExps; - CRegExp folderRegExp(true, true); + CRegExp folderRegExp(true, CRegExp::autoUtf8); const CStdStringArray& strFolderRegExps = g_advancedSettings.m_folderStackRegExps; CStdStringArray::const_iterator strExpression = strFolderRegExps.begin(); @@ -2416,7 +2416,7 @@ void CFileItemList::StackFiles() { // Precompile our REs VECCREGEXP stackRegExps; - CRegExp tmpRegExp(true, true); + CRegExp tmpRegExp(true, CRegExp::autoUtf8); const CStdStringArray& strStackRegExps = g_advancedSettings.m_videoStackRegExps; CStdStringArray::const_iterator strRegExp = strStackRegExps.begin(); while (strRegExp != strStackRegExps.end()) @@ -3242,7 +3242,7 @@ CStdString CFileItem::FindTrailer() const // Precompile our REs VECCREGEXP matchRegExps; - CRegExp tmpRegExp(true, true); + CRegExp tmpRegExp(true, CRegExp::autoUtf8); const CStdStringArray& strMatchRegExps = g_advancedSettings.m_trailerMatchRegExps; CStdStringArray::const_iterator strRegExp = strMatchRegExps.begin(); diff --git a/xbmc/Util.cpp b/xbmc/Util.cpp index ae6eaa6a0f..68d0188ceb 100644 --- a/xbmc/Util.cpp +++ b/xbmc/Util.cpp @@ -243,8 +243,8 @@ void CUtil::CleanString(const CStdString& strFileName, CStdString& strTitle, CSt const CStdStringArray &regexps = g_advancedSettings.m_videoCleanStringRegExps; - CRegExp reTags(true, true); - CRegExp reYear(false, true); + CRegExp reTags(true, CRegExp::autoUtf8); + CRegExp reYear(false, CRegExp::autoUtf8); if (!reYear.RegComp(g_advancedSettings.m_videoCleanDateTimeRegExp)) { @@ -519,7 +519,7 @@ bool CUtil::ExcludeFileOrFolder(const CStdString& strFileOrFolder, const CStdStr if (strFileOrFolder.empty()) return false; - CRegExp regExExcludes(true, true); // case insensitive regex + CRegExp regExExcludes(true, CRegExp::autoUtf8); // case insensitive regex 
for (unsigned int i = 0; i < regexps.size(); i++) { diff --git a/xbmc/cores/ExternalPlayer/ExternalPlayer.cpp b/xbmc/cores/ExternalPlayer/ExternalPlayer.cpp index 7b72a191a8..4301a23214 100644 --- a/xbmc/cores/ExternalPlayer/ExternalPlayer.cpp +++ b/xbmc/cores/ExternalPlayer/ExternalPlayer.cpp @@ -185,7 +185,7 @@ void CExternalPlayer::Process() CStdString strMatch = vecSplit[0]; StringUtils::Replace(strMatch, ",,",","); bool bCaseless = vecSplit[3].find('i') != std::string::npos; - CRegExp regExp(bCaseless, true); + CRegExp regExp(bCaseless, CRegExp::autoUtf8); if (!regExp.RegComp(strMatch.c_str())) { // invalid regexp - complain in logs diff --git a/xbmc/cores/playercorefactory/PlayerSelectionRule.cpp b/xbmc/cores/playercorefactory/PlayerSelectionRule.cpp index a0add7c989..63a0125a3e 100644 --- a/xbmc/cores/playercorefactory/PlayerSelectionRule.cpp +++ b/xbmc/cores/playercorefactory/PlayerSelectionRule.cpp @@ -118,7 +118,7 @@ void CPlayerSelectionRule::GetPlayers(const CFileItem& item, VECPLAYERCORES &vec if (m_tDVDFile >= 0 && (m_tDVDFile > 0) != item.IsDVDFile()) return; if (m_tDVDImage >= 0 && (m_tDVDImage > 0) != item.IsDVDImage()) return; - CRegExp regExp(false, true); + CRegExp regExp(false, CRegExp::autoUtf8); if (m_bStreamDetails) { diff --git a/xbmc/filesystem/StackDirectory.cpp b/xbmc/filesystem/StackDirectory.cpp index 8b06ce8864..3d1bbacd20 100644 --- a/xbmc/filesystem/StackDirectory.cpp +++ b/xbmc/filesystem/StackDirectory.cpp @@ -59,7 +59,7 @@ namespace XFILE { // Load up our REs VECCREGEXP RegExps; - CRegExp tempRE(true, true); + CRegExp tempRE(true, CRegExp::autoUtf8); const CStdStringArray& strRegExps = g_advancedSettings.m_videoStackRegExps; CStdStringArray::const_iterator itRegExp = strRegExps.begin(); vector<pair<int, CStdString> > badStacks; diff --git a/xbmc/utils/RegExp.cpp b/xbmc/utils/RegExp.cpp index 5afa971c47..ee2f462689 100644 --- a/xbmc/utils/RegExp.cpp +++ b/xbmc/utils/RegExp.cpp @@ -53,19 +53,20 @@ int CRegExp::m_UcpSupported = -1; 
int CRegExp::m_JitSupported = -1; -CRegExp::CRegExp(bool caseless /*= false*/, bool utf8 /*= false*/) +CRegExp::CRegExp(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/) { InitValues(caseless, utf8); } -void CRegExp::InitValues(bool caseless /*= false*/, bool utf8 /*= false*/) +void CRegExp::InitValues(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/) { + m_utf8Mode = utf8; m_re = NULL; m_sd = NULL; m_iOptions = PCRE_DOTALL | PCRE_NEWLINE_ANY; if(caseless) m_iOptions |= PCRE_CASELESS; - if (utf8) + if (m_utf8Mode == forceUtf8) { if (IsUtf8Supported()) m_iOptions |= PCRE_UTF8; @@ -82,17 +83,162 @@ void CRegExp::InitValues(bool caseless /*= false*/, bool utf8 /*= false*/) memset(m_iOvector, 0, sizeof(m_iOvector)); } -CRegExp::CRegExp(bool caseless, bool utf8, const char *re, studyMode study /*= NoStudy*/) +CRegExp::CRegExp(bool caseless, CRegExp::utf8Mode utf8, const char *re, studyMode study /*= NoStudy*/) { + if (utf8 == autoUtf8) + utf8 = requireUtf8(re) ? 
forceUtf8 : asciiOnly; + InitValues(caseless, utf8); RegComp(re, study); } +bool CRegExp::requireUtf8(const std::string& regexp) +{ + // enable UTF-8 mode if regexp string has UTF-8 multibyte sequences + if (CUtf8Utils::checkStrForUtf8(regexp) == CUtf8Utils::utf8string) + return true; + + // check for explicit Unicode Properties (\p, \P, \X) and for Unicode character codes (greater than 0xFF) in form \x{hhh..} + // note: PCRE change meaning of \w, \s, \d (and \W, \S, \D) when Unicode Properties are enabled, + // but in auto mode we enable UNP for US-ASCII regexp only if regexp contains explicit \p, \P, \X or Unicode character code + const char* const regexpC = regexp.c_str(); + const size_t len = regexp.length(); + size_t pos = 0; + + while (pos < len) + { + const char chr = regexpC[pos]; + if (chr == '\\') + { + const char nextChr = regexpC[pos + 1]; + + if (nextChr == 'p' || nextChr == 'P' || nextChr == 'X') + return true; // found Unicode Properties + else if (nextChr == 'Q') + pos = regexp.find("\\E", pos + 2); // skip all literals in "\Q...\E" + else if (nextChr == 'x' && regexpC[pos + 2] == '{') + { // Unicode character with hex code + if (readCharXCode(regexp, pos) >= 0x100) + return true; // found Unicode character code + } + else if (nextChr == '\\' || nextChr == '(' || nextChr == ')' + || nextChr == '[' || nextChr == ']') + pos++; // exclude next character from analyze + + } // chr != '\\' + else if (chr == '(' && regexpC[pos + 1] == '?' 
&& regexpC[pos + 2] == '#') // comment in regexp + pos = regexp.find(')', pos); // skip comment + else if (chr == '[') + { + if (isCharClassWithUnicode(regexp, pos)) + return true; + } + + if (pos == std::string::npos) // check results of regexp.find() and isCharClassWithUnicode + return false; + + pos++; + } + + // no Unicode Properties was found + return false; +} + +inline int CRegExp::readCharXCode(const std::string& regexp, size_t& pos) +{ + // read hex character code in form "\x{hh..}" + // 'pos' must point to '\' + if (pos >= regexp.length()) + return -1; + const char* const regexpC = regexp.c_str(); + if (regexpC[pos] != '\\' || regexpC[pos + 1] != 'x' || regexpC[pos + 2] != '{') + return -1; + + pos++; + const size_t startPos = pos; // 'startPos' points to 'x' + const size_t closingBracketPos = regexp.find('}', startPos + 2); + if (closingBracketPos == std::string::npos) + return 0; // return character zero code, leave 'pos' at 'x' + + pos++; // 'pos' points to '{' + int chCode = 0; + while (++pos < closingBracketPos) + { + const int xdigitVal = StringUtils::asciixdigitvalue(regexpC[pos]); + if (xdigitVal >= 0) + chCode = chCode * 16 + xdigitVal; + else + { // found non-hexdigit + pos = startPos; // reset 'pos' to 'startPos', process "{hh..}" as non-code + return 0; // return character zero code + } + } + + return chCode; +} + +bool CRegExp::isCharClassWithUnicode(const std::string& regexp, size_t& pos) +{ + const char* const regexpC = regexp.c_str(); + const size_t len = regexp.length(); + if (pos > len || regexpC[pos] != '[') + return false; + + // look for Unicode character code "\x{hhh..}" and Unicode properties "\P", "\p" and "\X" + // find end (terminating ']') of character class (like "[a-h45]") + // detect nested POSIX classes like "[[:lower:]]" and escaped brackets like "[\]]" + bool needUnicode = false; + while (++pos < len) + { + if (regexpC[pos] == '[' && regexpC[pos + 1] == ':') + { // possible POSIX character class, like "[:alpha:]" + const 
size_t nextClosingBracketPos = regexp.find(']', pos + 2); // don't care about "\]", as it produce error if used inside POSIX char class + + if (nextClosingBracketPos == std::string::npos) + { // error in regexp: no closing ']' for character class + pos = std::string::npos; + return needUnicode; + } + else if (regexpC[nextClosingBracketPos - 1] == ':') + pos = nextClosingBracketPos; // skip POSIX character class + // if ":]" is not found, process "[:..." as part of normal character class + } + else if (regexpC[pos] == ']') + return needUnicode; // end of character class + else if (regexpC[pos] == '\\') + { + const char nextChar = regexpC[pos + 1]; + if (nextChar == ']' || nextChar == '[') + pos++; // skip next character + else if (nextChar == 'Q') + { + pos = regexp.find("\\E", pos + 2); + if (pos == std::string::npos) + return needUnicode; // error in regexp: no closing "\E" after "\Q" in character class + else + pos++; // skip "\E" + } + else if (nextChar == 'p' || nextChar == 'P' || nextChar == 'X') + needUnicode = true; // don't care about property name as it can contain only ASCII chars + else if (nextChar == 'x') + { + if (readCharXCode(regexp, pos) >= 0x100) + needUnicode = true; + } + } + } + pos = std::string::npos; // closing square bracket was not found + + return needUnicode; +} + + CRegExp::CRegExp(const CRegExp& re) { m_re = NULL; m_sd = NULL; m_jitStack = NULL; + m_utf8Mode = re.m_utf8Mode; m_iOptions = re.m_iOptions; *this = re; } @@ -140,10 +286,13 @@ bool CRegExp::RegComp(const char *re, studyMode study /*= NoStudy*/) m_iMatchCount = 0; const char *errMsg = NULL; int errOffset = 0; + int options = m_iOptions; + if (m_utf8Mode == autoUtf8 && requireUtf8(re)) + options |= (IsUtf8Supported() ? PCRE_UTF8 : 0) | (AreUnicodePropertiesSupported() ? 
PCRE_UCP : 0); Cleanup(); - m_re = pcre_compile(re, m_iOptions, &errMsg, &errOffset, NULL); + m_re = pcre_compile(re, options, &errMsg, &errOffset, NULL); if (!m_re) { m_pattern.clear(); diff --git a/xbmc/utils/RegExp.h b/xbmc/utils/RegExp.h index d23166995b..de1ce287a6 100644 --- a/xbmc/utils/RegExp.h +++ b/xbmc/utils/RegExp.h @@ -48,25 +48,32 @@ public: StudyRegExp = 1, // study expression (slower compilation, faster find) StudyWithJitComp // study expression and JIT-compile it, if possible (heavyweight optimization) }; + enum utf8Mode + { + autoUtf8 = -1, // analyze regexp for UTF-8 multi-byte chars, for Unicode codes > 0xFF + // or explicit Unicode properties (\p, \P and \X), enable UTF-8 mode if any of them are found + asciiOnly = 0, // process regexp and strings as single-byte encoded strings + forceUtf8 = 1 // enable UTF-8 mode (with Unicode properties) + }; static const int m_MaxNumOfBackrefrences = 20; /** * @param caseless (optional) Matching will be case insensitive if set to true * or case sensitive if set to false - * @param utf8 (optional) If set to true all string will be processed as UTF-8 strings + * @param utf8 (optional) Control UTF-8 processing */ - CRegExp(bool caseless = false, bool utf8 = false); + CRegExp(bool caseless = false, utf8Mode utf8 = asciiOnly); /** * Create new CRegExp object and compile regexp expression in one step * @warning Use only with hardcoded regexp when you're sure that regexp is compiled without errors * @param caseless Matching will be case insensitive if set to true * or case sensitive if set to false - * @param utf8 If set to true all string will be processed as UTF-8 strings + * @param utf8 Control UTF-8 processing * @param re The regular expression * @param study (optional) Controls study of expression, useful if expression will be used * several times */ - CRegExp(bool caseless, bool utf8, const char *re, studyMode study = NoStudy); + CRegExp(bool caseless, utf8Mode utf8, const char *re, studyMode study = 
NoStudy); CRegExp(const CRegExp& re); ~CRegExp(); @@ -143,7 +150,10 @@ public: private: int PrivateRegFind(size_t bufferLen, const char *str, unsigned int startoffset = 0, int maxNumberOfCharsToTest = -1); - void InitValues(bool caseless = false, bool utf8 = false); + void InitValues(bool caseless = false, CRegExp::utf8Mode utf8 = asciiOnly); + static bool requireUtf8(const std::string& regexp); + static int readCharXCode(const std::string& regexp, size_t& pos); + static bool isCharClassWithUnicode(const std::string& regexp, size_t& pos); void Cleanup(); inline bool IsValidSubNumber(int iSub) const; @@ -153,6 +163,7 @@ private: static const int OVECCOUNT=(m_MaxNumOfBackrefrences + 1) * 3; unsigned int m_offset; int m_iOvector[OVECCOUNT]; + utf8Mode m_utf8Mode; int m_iMatchCount; int m_iOptions; bool m_jitCompiled; diff --git a/xbmc/utils/ScraperParser.cpp b/xbmc/utils/ScraperParser.cpp index e424fcc467..941e9ed69e 100644 --- a/xbmc/utils/ScraperParser.cpp +++ b/xbmc/utils/ScraperParser.cpp @@ -204,7 +204,19 @@ void CScraperParser::ParseExpression(const CStdString& input, CStdString& dest, if (stricmp(sensitive,"yes") == 0) bInsensitive=false; // match case sensitive - CRegExp reg(bInsensitive, true); + CRegExp::utf8Mode eUtf8 = CRegExp::autoUtf8; + const char* const strUtf8 = pExpression->Attribute("utf8"); + if (strUtf8) + { + if (stricmp(strUtf8, "yes") == 0) + eUtf8 = CRegExp::forceUtf8; + else if (stricmp(strUtf8, "no") == 0) + eUtf8 = CRegExp::asciiOnly; + else if (stricmp(strUtf8, "auto") == 0) + eUtf8 = CRegExp::autoUtf8; + } + + CRegExp reg(bInsensitive, eUtf8); CStdString strExpression; if (pExpression->FirstChild()) strExpression = pExpression->FirstChild()->Value(); diff --git a/xbmc/utils/StringUtils.cpp b/xbmc/utils/StringUtils.cpp index ca84fdd9d7..33d298b184 100644 --- a/xbmc/utils/StringUtils.cpp +++ b/xbmc/utils/StringUtils.cpp @@ -733,6 +733,28 @@ bool StringUtils::IsInteger(const CStdString& str) return i == str.size() && n > 0; } +int 
StringUtils::asciidigitvalue(char chr) +{ + if (!isasciidigit(chr)) + return -1; + + return chr - '0'; +} + +int StringUtils::asciixdigitvalue(char chr) +{ + int v = asciidigitvalue(chr); + if (v >= 0) + return v; + if (chr >= 'a' && chr <= 'f') + return chr - 'a' + 10; + if (chr >= 'A' && chr <= 'F') + return chr - 'A' + 10; + + return -1; +} + + void StringUtils::RemoveCRLF(CStdString& strLine) { StringUtils::TrimRight(strLine, "\n\r"); diff --git a/xbmc/utils/StringUtils.h b/xbmc/utils/StringUtils.h index 4f5d891317..54c835abe0 100644 --- a/xbmc/utils/StringUtils.h +++ b/xbmc/utils/StringUtils.h @@ -125,6 +125,32 @@ public: \return true if the string is an integer, false otherwise. */ static bool IsInteger(const CStdString& str); + + /* The next several isasciiXX and asciiXXvalue functions are locale independent (US-ASCII only), + * as opposed to standard ::isXX (::isalpha, ::isdigit...) which are locale dependent. + * Next functions get parameter as char and don't need double cast ((int)(unsigned char) is required for standard functions). 
*/ + inline static bool isasciidigit(char chr) // locale independent + { + return chr >= '0' && chr <= '9'; + } + inline static bool isasciixdigit(char chr) // locale independent + { + return (chr >= '0' && chr <= '9') || (chr >= 'a' && chr <= 'f') || (chr >= 'A' && chr <= 'F'); + } + static int asciidigitvalue(char chr); // locale independent + static int asciixdigitvalue(char chr); // locale independent + inline static bool isasciiuppercaseletter(char chr) // locale independent + { + return (chr >= 'A' && chr <= 'Z'); + } + inline static bool isasciilowercaseletter(char chr) // locale independent + { + return (chr >= 'a' && chr <= 'z'); + } + inline static bool isasciialphanum(char chr) // locale independent + { + return isasciiuppercaseletter(chr) || isasciilowercaseletter(chr) || isasciidigit(chr); + } static CStdString SizeToString(int64_t size); static const CStdString EmptyString; static const std::string Empty; diff --git a/xbmc/utils/XBMCTinyXML.cpp b/xbmc/utils/XBMCTinyXML.cpp index 710171079a..e29f8e58ed 100644 --- a/xbmc/utils/XBMCTinyXML.cpp +++ b/xbmc/utils/XBMCTinyXML.cpp @@ -212,7 +212,7 @@ bool CXBMCTinyXML::InternalParse(const std::string& rawdata, TiXmlEncoding encod return (TiXmlDocument::Parse(rawdata.c_str(), NULL, encoding) != NULL); // nothing to fix, process data directly std::string data(rawdata); - CRegExp re(false, false, "^&(amp|lt|gt|quot|apos|#x[a-fA-F0-9]{1,4}|#[0-9]{1,5});.*"); + CRegExp re(false, CRegExp::asciiOnly, "^&(amp|lt|gt|quot|apos|#x[a-fA-F0-9]{1,4}|#[0-9]{1,5});.*"); do { if (re.RegFind(data, pos, MAX_ENTITY_LENGTH) < 0) diff --git a/xbmc/video/VideoInfoScanner.cpp b/xbmc/video/VideoInfoScanner.cpp index 4e2780a12b..62569cb9d9 100644 --- a/xbmc/video/VideoInfoScanner.cpp +++ b/xbmc/video/VideoInfoScanner.cpp @@ -870,7 +870,7 @@ namespace VIDEO for (unsigned int i=0;i<expression.size();++i) { - CRegExp reg(true, true); + CRegExp reg(true, CRegExp::autoUtf8); if (!reg.RegComp(expression[i].regexp)) continue; @@ -939,7 
+939,7 @@ namespace VIDEO // add what we found by now episodeList.push_back(episode); - CRegExp reg2(true, true); + CRegExp reg2(true, CRegExp::autoUtf8); // check the remainder of the string for any further episodes. if (!byDate && reg2.RegComp(g_advancedSettings.m_tvshowMultiPartEnumRegExp)) { |