aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Karlson2k <k2k@narod.ru> 2013-11-14 22:03:06 +0400
committer Karlson2k <k2k@narod.ru> 2013-12-09 00:34:10 +0400
commit d95792fd0de12a8e60f8a1a3b267a139b442a65c (patch)
tree 06e0dbef5a14ae7bc60e72de4b69a4d4eec1d98d
parent 7d057d7c9f145f39bf1be8bc68619a50f72deadf (diff)
RegExp: add automatic UTF-8 mode based on given regular expression
-rw-r--r-- xbmc/utils/RegExp.cpp 159
-rw-r--r-- xbmc/utils/RegExp.h 21
2 files changed, 170 insertions, 10 deletions
diff --git a/xbmc/utils/RegExp.cpp b/xbmc/utils/RegExp.cpp
index 5afa971c47..ee2f462689 100644
--- a/xbmc/utils/RegExp.cpp
+++ b/xbmc/utils/RegExp.cpp
@@ -53,19 +53,20 @@ int CRegExp::m_UcpSupported = -1;
int CRegExp::m_JitSupported = -1;
-CRegExp::CRegExp(bool caseless /*= false*/, bool utf8 /*= false*/)
+CRegExp::CRegExp(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/)
{
InitValues(caseless, utf8);
}
-void CRegExp::InitValues(bool caseless /*= false*/, bool utf8 /*= false*/)
+void CRegExp::InitValues(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/)
{
+ m_utf8Mode = utf8;
m_re = NULL;
m_sd = NULL;
m_iOptions = PCRE_DOTALL | PCRE_NEWLINE_ANY;
if(caseless)
m_iOptions |= PCRE_CASELESS;
- if (utf8)
+ if (m_utf8Mode == forceUtf8)
{
if (IsUtf8Supported())
m_iOptions |= PCRE_UTF8;
@@ -82,17 +83,162 @@ void CRegExp::InitValues(bool caseless /*= false*/, bool utf8 /*= false*/)
memset(m_iOvector, 0, sizeof(m_iOvector));
}
-CRegExp::CRegExp(bool caseless, bool utf8, const char *re, studyMode study /*= NoStudy*/)
+CRegExp::CRegExp(bool caseless, CRegExp::utf8Mode utf8, const char *re, studyMode study /*= NoStudy*/)
{
+ if (utf8 == autoUtf8)
+ utf8 = requireUtf8(re) ? forceUtf8 : asciiOnly;
+
InitValues(caseless, utf8);
RegComp(re, study);
}
+bool CRegExp::requireUtf8(const std::string& regexp)
+{
+  // Decide whether 'regexp' has to be compiled in UTF-8 mode.
+  // Raw UTF-8 multibyte sequences in the pattern text always require it.
+  if (CUtf8Utils::checkStrForUtf8(regexp) == CUtf8Utils::utf8string)
+    return true;
+
+  // Otherwise look for explicit Unicode Properties (\p, \P, \X) and for character
+  // codes above 0xFF written as \x{hhh..}.
+  // note: PCRE changes the meaning of \w, \s, \d (and \W, \S, \D) when Unicode
+  // Properties are enabled, but those alone never trigger UTF-8 mode here.
+  const char* const pattern = regexp.c_str();
+  const size_t patternLen = regexp.length();
+
+  for (size_t idx = 0; idx < patternLen; ++idx)
+  {
+    switch (pattern[idx])
+    {
+    case '\\':
+      switch (pattern[idx + 1]) // safe at the last index: reads the terminating NUL
+      {
+      case 'p': case 'P': case 'X':
+        return true; // found Unicode Properties
+      case 'Q':
+        idx = regexp.find("\\E", idx + 2); // skip all literals in "\Q...\E"
+        break;
+      case 'x':
+        if (pattern[idx + 2] == '{' && readCharXCode(regexp, idx) >= 0x100)
+          return true; // found Unicode character code
+        break;
+      case '\\': case '(': case ')': case '[': case ']':
+        ++idx; // escaped metacharacter: exclude it from analysis
+        break;
+      }
+      break;
+    case '(':
+      if (pattern[idx + 1] == '?' && pattern[idx + 2] == '#')
+        idx = regexp.find(')', idx); // skip "(?#...)" comment
+      break;
+    case '[':
+      if (isCharClassWithUnicode(regexp, idx))
+        return true;
+      break;
+    }
+
+    // a failed find() or an unterminated character class ends the scan
+    if (idx == std::string::npos)
+      return false;
+  }
+  return false; // no Unicode Properties was found
+}
+
+inline int CRegExp::readCharXCode(const std::string& regexp, size_t& pos)
+{
+  // Parse a hex character code written as "\x{hh..}"; 'pos' must index the '\'.
+  // Returns -1 ('pos' untouched) if 'pos' is out of range or not at "\x{".
+  // Returns 0 with 'pos' on 'x' if '}' is missing or a non-hex digit is found,
+  // so "{..}" is rescanned as text. On success 'pos' is left on the closing '}'.
+  if (pos >= regexp.length())
+    return -1;
+  const char* const regexpC = regexp.c_str();
+  if (regexpC[pos] != '\\' || regexpC[pos + 1] != 'x' || regexpC[pos + 2] != '{')
+    return -1;
+
+  const size_t startPos = ++pos; // 'startPos' points to 'x'
+  const size_t closingBracketPos = regexp.find('}', startPos + 2);
+  if (closingBracketPos == std::string::npos)
+    return 0; // return character zero code, leave 'pos' at 'x'
+  int chCode = 0;
+  for (pos += 2; pos < closingBracketPos; ++pos) // first digit follows "x{"
+  {
+    const int xdigitVal = StringUtils::asciixdigitvalue(regexpC[pos]);
+    if (xdigitVal < 0)
+    { // found non-hexdigit
+      pos = startPos; // reset 'pos' to 'startPos', process "{hh..}" as non-code
+      return 0; // return character zero code
+    }
+    // stop growing past the Unicode range so long digit runs cannot overflow 'int'
+    if (chCode < 0x110000)
+      chCode = chCode * 16 + xdigitVal;
+  }
+
+  return chCode;
+}
+
+bool CRegExp::isCharClassWithUnicode(const std::string& regexp, size_t& pos)
+{
+  // Scan the character class whose '[' is at 'pos' (returns false, 'pos' kept,
+  // otherwise) and report whether it needs Unicode: a "\x{hhh..}" code above
+  // 0xFF or a property "\p", "\P", "\X". On return 'pos' indexes the closing
+  // ']', or is std::string::npos when the class (or a "\Q" run) never ends.
+  // NOTE(review): a literal ']' placed first in the class ("[]a]") is still
+  // taken as the terminator here.
+  const char* const regexpC = regexp.c_str();
+  const size_t len = regexp.length();
+  if (pos >= len || regexpC[pos] != '[')
+    return false;
+
+  bool needUnicode = false;
+  while (++pos < len)
+  {
+    if (regexpC[pos] == '[' && regexpC[pos + 1] == ':')
+    { // possible nested POSIX character class, like "[:alpha:]"
+      const size_t nextClosingBracketPos = regexp.find(']', pos + 2); // don't care about "\]", as it produces an error if used inside POSIX char class
+
+      if (nextClosingBracketPos == std::string::npos)
+      { // error in regexp: no closing ']' for character class
+        pos = std::string::npos;
+        return needUnicode;
+      }
+      else if (regexpC[nextClosingBracketPos - 1] == ':')
+        pos = nextClosingBracketPos; // skip POSIX character class
+      // if ":]" is not found, process "[:..." as part of normal character class
+    }
+    else if (regexpC[pos] == ']')
+      return needUnicode; // end of character class
+    else if (regexpC[pos] == '\\')
+    {
+      const char nextChar = regexpC[pos + 1];
+      // "\\" is consumed as a pair so its second half cannot escape what follows
+      if (nextChar == ']' || nextChar == '[' || nextChar == '\\')
+        pos++; // skip next character
+      else if (nextChar == 'Q')
+      {
+        pos = regexp.find("\\E", pos + 2);
+        if (pos == std::string::npos)
+          return needUnicode; // error in regexp: no closing "\E" after "\Q"
+        pos++; // skip "\E"
+      }
+      else if (nextChar == 'p' || nextChar == 'P' || nextChar == 'X')
+        needUnicode = true; // property names are ASCII-only, no need to skip them
+      else if (nextChar == 'x' && readCharXCode(regexp, pos) >= 0x100)
+        needUnicode = true;
+    }
+  }
+  pos = std::string::npos; // closing square bracket was not found
+
+  return needUnicode;
+}
+
+
CRegExp::CRegExp(const CRegExp& re)
{
m_re = NULL;
m_sd = NULL;
m_jitStack = NULL;
+ m_utf8Mode = re.m_utf8Mode;
m_iOptions = re.m_iOptions;
*this = re;
}
@@ -140,10 +286,13 @@ bool CRegExp::RegComp(const char *re, studyMode study /*= NoStudy*/)
m_iMatchCount = 0;
const char *errMsg = NULL;
int errOffset = 0;
+ int options = m_iOptions;
+ if (m_utf8Mode == autoUtf8 && requireUtf8(re))
+ options |= (IsUtf8Supported() ? PCRE_UTF8 : 0) | (AreUnicodePropertiesSupported() ? PCRE_UCP : 0);
Cleanup();
- m_re = pcre_compile(re, m_iOptions, &errMsg, &errOffset, NULL);
+ m_re = pcre_compile(re, options, &errMsg, &errOffset, NULL);
if (!m_re)
{
m_pattern.clear();
diff --git a/xbmc/utils/RegExp.h b/xbmc/utils/RegExp.h
index d23166995b..de1ce287a6 100644
--- a/xbmc/utils/RegExp.h
+++ b/xbmc/utils/RegExp.h
@@ -48,25 +48,32 @@ public:
StudyRegExp = 1, // study expression (slower compilation, faster find)
StudyWithJitComp // study expression and JIT-compile it, if possible (heavyweight optimization)
};
+ enum utf8Mode
+ {
+ autoUtf8 = -1, // inspect the regexp: enable UTF-8 mode if it contains UTF-8 multi-byte chars,
+ // Unicode codes > 0xFF in "\x{hhh..}" form or explicit Unicode properties (\p, \P and \X)
+ asciiOnly = 0, // process regexp and strings as single-byte encoded strings
+ forceUtf8 = 1 // always request UTF-8 mode (with Unicode properties)
+ };
static const int m_MaxNumOfBackrefrences = 20;
/**
* @param caseless (optional) Matching will be case insensitive if set to true
* or case sensitive if set to false
- * @param utf8 (optional) If set to true all string will be processed as UTF-8 strings
+ * @param utf8 (optional) Control UTF-8 processing
*/
- CRegExp(bool caseless = false, bool utf8 = false);
+ CRegExp(bool caseless = false, utf8Mode utf8 = asciiOnly);
/**
* Create new CRegExp object and compile regexp expression in one step
* @warning Use only with hardcoded regexp when you're sure that regexp is compiled without errors
* @param caseless Matching will be case insensitive if set to true
* or case sensitive if set to false
- * @param utf8 If set to true all string will be processed as UTF-8 strings
+ * @param utf8 Control UTF-8 processing
* @param re The regular expression
* @param study (optional) Controls study of expression, useful if expression will be used
* several times
*/
- CRegExp(bool caseless, bool utf8, const char *re, studyMode study = NoStudy);
+ CRegExp(bool caseless, utf8Mode utf8, const char *re, studyMode study = NoStudy);
CRegExp(const CRegExp& re);
~CRegExp();
@@ -143,7 +150,10 @@ public:
private:
int PrivateRegFind(size_t bufferLen, const char *str, unsigned int startoffset = 0, int maxNumberOfCharsToTest = -1);
- void InitValues(bool caseless = false, bool utf8 = false);
+ void InitValues(bool caseless = false, CRegExp::utf8Mode utf8 = asciiOnly);
+ static bool requireUtf8(const std::string& regexp);
+ static int readCharXCode(const std::string& regexp, size_t& pos);
+ static bool isCharClassWithUnicode(const std::string& regexp, size_t& pos);
void Cleanup();
inline bool IsValidSubNumber(int iSub) const;
@@ -153,6 +163,7 @@ private:
static const int OVECCOUNT=(m_MaxNumOfBackrefrences + 1) * 3;
unsigned int m_offset;
int m_iOvector[OVECCOUNT];
+ utf8Mode m_utf8Mode;
int m_iMatchCount;
int m_iOptions;
bool m_jitCompiled;