diff options
author | Edinaldo P. Silva <edps.mundognu@gmail.com> | 2016-07-24 21:33:01 -0500 |
---|---|---|
committer | Willy Sudiarto Raharjo <willysr@slackbuilds.org> | 2016-07-30 08:28:43 +0700 |
commit | cea1efabbdc09ae5c4abc8d71472eb7d2bda323a (patch) | |
tree | 4c73668b01c3d8f8e9e80871ec5e4f2b3a76e1c8 /libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch | |
parent | 1151ce1229e92aa0ab4795ba8ff3c3b8d346d411 (diff) |
libraries/libuchardet: Updated for version 0.0.5.
Signed-off-by: Robby Workman <rworkman@slackbuilds.org>
Diffstat (limited to 'libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch')
-rw-r--r-- | libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch | 116 |
1 files changed, 116 insertions, 0 deletions
diff --git a/libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch b/libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch new file mode 100644 index 0000000000000..c82aee866ebc8 --- /dev/null +++ b/libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch @@ -0,0 +1,116 @@ +commit 4c8316f9cfda38d75fb015c0eb40e0eebb03d28f +Author: Jehan <jehan@girinstud.io> +Date: Sat Dec 5 21:04:20 2015 +0100 + + Nearly-ASCII text with NBSP is still not ASCII. + + There is no "exception" in encoding. The non-breaking space 0xA0 is not + ASCII, and therefore returning "ASCII" will later create issues (for + instance trying to re-encode with iconv produces an error). + This was obviously an explicit decision in original code (according to + code comments), probably tied to specifity of the original program from + Mozilla. Now we want strict detection. + I will return "ISO-8859-1" for "nearly-ASCII texts with NBSP as only + exception" (note that I could have returned any ISO-8859 charsets since + they all have this character in common). + +diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp +index ab8bae0..ff06b9d 100644 +--- a/src/nsUniversalDetector.cpp ++++ b/src/nsUniversalDetector.cpp +@@ -47,6 +47,7 @@ + + nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter) + { ++ mNbspFound = PR_FALSE; + mDone = PR_FALSE; + mBestGuess = -1; //illegal value as signal + mInTag = PR_FALSE; +@@ -75,6 +76,7 @@ nsUniversalDetector::~nsUniversalDetector() + void + nsUniversalDetector::Reset() + { ++ mNbspFound = PR_FALSE; + mDone = PR_FALSE; + mBestGuess = -1; //illegal value as signal + mInTag = PR_FALSE; +@@ -162,9 +164,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) + PRUint32 i; + for (i = 0; i < aLen; i++) + { +- /* Other than 0xA0, if every other character is ASCII, the page is ASCII. ++ /* If every other character is ASCII or 0xA0, we don't run charset ++ * probers. + * 0xA0 (NBSP in a few charset) is apparently a rare exception +- * of non-ASCII character contained in ASCII text. */ ++ * of non-ASCII character often contained in nearly-ASCII text. */ + if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') + { + /* We got a non-ASCII byte (high-byte) */ +@@ -203,11 +206,19 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) + } + else + { +- //ok, just pure ascii so far +- if ( ePureAscii == mInputState && +- (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) ) ++ /* Just pure ASCII or NBSP so far. */ ++ if (aBuf[i] == '\xA0') + { +- //found escape character or HZ "~{" ++ /* ASCII with the only exception of NBSP seems quite common. ++ * I doubt it is really necessary to train a model here, so let's ++ * just make an exception. ++ */ ++ mNbspFound = PR_TRUE; ++ } ++ else if (mInputState == ePureAscii && ++ (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~'))) ++ { ++ /* We found an escape character or HZ "~{". */ + mInputState = eEscAscii; + } + mLastChar = aBuf[i]; +@@ -229,6 +240,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) + mDone = PR_TRUE; + mDetectedCharset = mEscCharSetProber->GetCharSetName(); + } ++ else if (mNbspFound) ++ { ++ mDetectedCharset = "ISO-8859-1"; ++ } + else + { + /* ASCII with the ESC character (or the sequence "~{") is still +@@ -253,8 +268,17 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) + break; + + default: +- /* Pure ASCII */ +- mDetectedCharset = "ASCII"; ++ if (mNbspFound) ++ { ++ /* ISO-8859-1 is a good result candidate for ASCII + NBSP. ++ * (though it could have been any ISO-8859 encoding). */ ++ mDetectedCharset = "ISO-8859-1"; ++ } ++ else ++ { ++ /* Pure ASCII */ ++ mDetectedCharset = "ASCII"; ++ } + break; + } + return NS_OK; +diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h +index 4d9b460..9f0a4b1 100644 +--- a/src/nsUniversalDetector.h ++++ b/src/nsUniversalDetector.h +@@ -72,6 +72,7 @@ protected: + virtual void Report(const char* aCharset) = 0; + virtual void Reset(); + nsInputState mInputState; ++ PRBool mNbspFound; + PRBool mDone; + PRBool mInTag; + PRBool mStart; |