diff options
author | Karlson2k <k2k@narod.ru> | 2013-11-14 22:03:06 +0400 |
---|---|---|
committer | Karlson2k <k2k@narod.ru> | 2013-12-09 00:34:10 +0400 |
commit | d95792fd0de12a8e60f8a1a3b267a139b442a65c (patch) | |
tree | 06e0dbef5a14ae7bc60e72de4b69a4d4eec1d98d | |
parent | 7d057d7c9f145f39bf1be8bc68619a50f72deadf (diff) |
RegExp: add automatic UTF-8 mode based on given regular expression
-rw-r--r-- | xbmc/utils/RegExp.cpp | 159 | ||||
-rw-r--r-- | xbmc/utils/RegExp.h | 21 |
2 files changed, 170 insertions, 10 deletions
diff --git a/xbmc/utils/RegExp.cpp b/xbmc/utils/RegExp.cpp index 5afa971c47..ee2f462689 100644 --- a/xbmc/utils/RegExp.cpp +++ b/xbmc/utils/RegExp.cpp @@ -53,19 +53,20 @@ int CRegExp::m_UcpSupported = -1; int CRegExp::m_JitSupported = -1; -CRegExp::CRegExp(bool caseless /*= false*/, bool utf8 /*= false*/) +CRegExp::CRegExp(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/) { InitValues(caseless, utf8); } -void CRegExp::InitValues(bool caseless /*= false*/, bool utf8 /*= false*/) +void CRegExp::InitValues(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/) { + m_utf8Mode = utf8; m_re = NULL; m_sd = NULL; m_iOptions = PCRE_DOTALL | PCRE_NEWLINE_ANY; if(caseless) m_iOptions |= PCRE_CASELESS; - if (utf8) + if (m_utf8Mode == forceUtf8) { if (IsUtf8Supported()) m_iOptions |= PCRE_UTF8; @@ -82,17 +83,162 @@ void CRegExp::InitValues(bool caseless /*= false*/, bool utf8 /*= false*/) memset(m_iOvector, 0, sizeof(m_iOvector)); } -CRegExp::CRegExp(bool caseless, bool utf8, const char *re, studyMode study /*= NoStudy*/) +CRegExp::CRegExp(bool caseless, CRegExp::utf8Mode utf8, const char *re, studyMode study /*= NoStudy*/) { + if (utf8 == autoUtf8) + utf8 = requireUtf8(re) ? forceUtf8 : asciiOnly; + InitValues(caseless, utf8); RegComp(re, study); } +bool CRegExp::requireUtf8(const std::string& regexp) +{ + // enable UTF-8 mode if regexp string has UTF-8 multibyte sequences + if (CUtf8Utils::checkStrForUtf8(regexp) == CUtf8Utils::utf8string) + return true; + + // check for explicit Unicode Properties (\p, \P, \X) and for Unicode character codes (greater than 0xFF) in form \x{hhh..} + // note: PCRE change meaning of \w, \s, \d (and \W, \S, \D) when Unicode Properties are enabled, + // but in auto mode we enable UNP for US-ASCII regexp only if regexp contains explicit \p, \P, \X or Unicode character code + const char* const regexpC = regexp.c_str(); + const size_t len = regexp.length(); + size_t pos = 0; + + while (pos < len) + { + const char chr = regexpC[pos]; + if (chr == '\\') + { + const char nextChr = regexpC[pos + 1]; + + if (nextChr == 'p' || nextChr == 'P' || nextChr == 'X') + return true; // found Unicode Properties + else if (nextChr == 'Q') + pos = regexp.find("\\E", pos + 2); // skip all literals in "\Q...\E" + else if (nextChr == 'x' && regexpC[pos + 2] == '{') + { // Unicode character with hex code + if (readCharXCode(regexp, pos) >= 0x100) + return true; // found Unicode character code + } + else if (nextChr == '\\' || nextChr == '(' || nextChr == ')' + || nextChr == '[' || nextChr == ']') + pos++; // exclude next character from analyze + + } // chr != '\\' + else if (chr == '(' && regexpC[pos + 1] == '?' && regexpC[pos + 2] == '#') // comment in regexp + pos = regexp.find(')', pos); // skip comment + else if (chr == '[') + { + if (isCharClassWithUnicode(regexp, pos)) + return true; + } + + if (pos == std::string::npos) // check results of regexp.find() and isCharClassWithUnicode + return false; + + pos++; + } + + // no Unicode Properties was found + return false; +} + +inline int CRegExp::readCharXCode(const std::string& regexp, size_t& pos) +{ + // read hex character code in form "\x{hh..}" + // 'pos' must point to '\' + if (pos >= regexp.length()) + return -1; + const char* const regexpC = regexp.c_str(); + if (regexpC[pos] != '\\' || regexpC[pos + 1] != 'x' || regexpC[pos + 2] != '{') + return -1; + + pos++; + const size_t startPos = pos; // 'startPos' points to 'x' + const size_t closingBracketPos = regexp.find('}', startPos + 2); + if (closingBracketPos == std::string::npos) + return 0; // return character zero code, leave 'pos' at 'x' + + pos++; // 'pos' points to '{' + int chCode = 0; + while (++pos < closingBracketPos) + { + const int xdigitVal = StringUtils::asciixdigitvalue(regexpC[pos]); + if (xdigitVal >= 0) + chCode = chCode * 16 + xdigitVal; + else + { // found non-hexdigit + pos = startPos; // reset 'pos' to 'startPos', process "{hh..}" as non-code + return 0; // return character zero code + } + } + + return chCode; +} + +bool CRegExp::isCharClassWithUnicode(const std::string& regexp, size_t& pos) +{ + const char* const regexpC = regexp.c_str(); + const size_t len = regexp.length(); + if (pos > len || regexpC[pos] != '[') + return false; + + // look for Unicode character code "\x{hhh..}" and Unicode properties "\P", "\p" and "\X" + // find end (terminating ']') of character class (like "[a-h45]") + // detect nested POSIX classes like "[[:lower:]]" and escaped brackets like "[\]]" + bool needUnicode = false; + while (++pos < len) + { + if (regexpC[pos] == '[' && regexpC[pos + 1] == ':') + { // possible POSIX character class, like "[:alpha:]" + const size_t nextClosingBracketPos = regexp.find(']', pos + 2); // don't care about "\]", as it produce error if used inside POSIX char class + + if (nextClosingBracketPos == std::string::npos) + { // error in regexp: no closing ']' for character class + pos = std::string::npos; + return needUnicode; + } + else if (regexpC[nextClosingBracketPos - 1] == ':') + pos = nextClosingBracketPos; // skip POSIX character class + // if ":]" is not found, process "[:..." as part of normal character class + } + else if (regexpC[pos] == ']') + return needUnicode; // end of character class + else if (regexpC[pos] == '\\') + { + const char nextChar = regexpC[pos + 1]; + if (nextChar == ']' || nextChar == '[') + pos++; // skip next character + else if (nextChar == 'Q') + { + pos = regexp.find("\\E", pos + 2); + if (pos == std::string::npos) + return needUnicode; // error in regexp: no closing "\E" after "\Q" in character class + else + pos++; // skip "\E" + } + else if (nextChar == 'p' || nextChar == 'P' || nextChar == 'X') + needUnicode = true; // don't care about property name as it can contain only ASCII chars + else if (nextChar == 'x') + { + if (readCharXCode(regexp, pos) >= 0x100) + needUnicode = true; + } + } + } + pos = std::string::npos; // closing square bracket was not found + + return needUnicode; +} + + CRegExp::CRegExp(const CRegExp& re) { m_re = NULL; m_sd = NULL; m_jitStack = NULL; + m_utf8Mode = re.m_utf8Mode; m_iOptions = re.m_iOptions; *this = re; } @@ -140,10 +286,13 @@ bool CRegExp::RegComp(const char *re, studyMode study /*= NoStudy*/) m_iMatchCount = 0; const char *errMsg = NULL; int errOffset = 0; + int options = m_iOptions; + if (m_utf8Mode == autoUtf8 && requireUtf8(re)) + options |= (IsUtf8Supported() ? PCRE_UTF8 : 0) | (AreUnicodePropertiesSupported() ? PCRE_UCP : 0); Cleanup(); - m_re = pcre_compile(re, m_iOptions, &errMsg, &errOffset, NULL); + m_re = pcre_compile(re, options, &errMsg, &errOffset, NULL); if (!m_re) { m_pattern.clear(); diff --git a/xbmc/utils/RegExp.h b/xbmc/utils/RegExp.h index d23166995b..de1ce287a6 100644 --- a/xbmc/utils/RegExp.h +++ b/xbmc/utils/RegExp.h @@ -48,25 +48,32 @@ public: StudyRegExp = 1, // study expression (slower compilation, faster find) StudyWithJitComp // study expression and JIT-compile it, if possible (heavyweight optimization) }; + enum utf8Mode + { + autoUtf8 = -1, // analyze regexp for UTF-8 multi-byte chars, for Unicode codes > 0xFF + // or explicit Unicode properties (\p, \P and \X), enable UTF-8 mode if any of them are found + asciiOnly = 0, // process regexp and strings as single-byte encoded strings + forceUtf8 = 1 // enable UTF-8 mode (with Unicode properties) + }; static const int m_MaxNumOfBackrefrences = 20; /** * @param caseless (optional) Matching will be case insensitive if set to true * or case sensitive if set to false - * @param utf8 (optional) If set to true all string will be processed as UTF-8 strings + * @param utf8 (optional) Control UTF-8 processing */ - CRegExp(bool caseless = false, bool utf8 = false); + CRegExp(bool caseless = false, utf8Mode utf8 = asciiOnly); /** * Create new CRegExp object and compile regexp expression in one step * @warning Use only with hardcoded regexp when you're sure that regexp is compiled without errors * @param caseless Matching will be case insensitive if set to true * or case sensitive if set to false - * @param utf8 If set to true all string will be processed as UTF-8 strings + * @param utf8 Control UTF-8 processing * @param re The regular expression * @param study (optional) Controls study of expression, useful if expression will be used * several times */ - CRegExp(bool caseless, bool utf8, const char *re, studyMode study = NoStudy); + CRegExp(bool caseless, utf8Mode utf8, const char *re, studyMode study = NoStudy); CRegExp(const CRegExp& re); ~CRegExp(); @@ -143,7 +150,10 @@ public: private: int PrivateRegFind(size_t bufferLen, const char *str, unsigned int startoffset = 0, int maxNumberOfCharsToTest = -1); - void InitValues(bool caseless = false, bool utf8 = false); + void InitValues(bool caseless = false, CRegExp::utf8Mode utf8 = asciiOnly); + static bool requireUtf8(const std::string& regexp); + static int readCharXCode(const std::string& regexp, size_t& pos); + static bool isCharClassWithUnicode(const std::string& regexp, size_t& pos); void Cleanup(); inline bool IsValidSubNumber(int iSub) const; @@ -153,6 +163,7 @@ private: static const int OVECCOUNT=(m_MaxNumOfBackrefrences + 1) * 3; unsigned int m_offset; int m_iOvector[OVECCOUNT]; + utf8Mode m_utf8Mode; int m_iMatchCount; int m_iOptions; bool m_jitCompiled; |