diff options
Diffstat (limited to 'libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch')
-rw-r--r-- | libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch | 116 |
1 files changed, 0 insertions, 116 deletions
diff --git a/libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch b/libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch deleted file mode 100644 index c82aee866ebc8..0000000000000 --- a/libraries/libuchardet/uchardet-0.0.5-fix-ASCII-detection.patch +++ /dev/null @@ -1,116 +0,0 @@ -commit 4c8316f9cfda38d75fb015c0eb40e0eebb03d28f -Author: Jehan <jehan@girinstud.io> -Date: Sat Dec 5 21:04:20 2015 +0100 - - Nearly-ASCII text with NBSP is still not ASCII. - - There is no "exception" in encoding. The non-breaking space 0xA0 is not - ASCII, and therefore returning "ASCII" will later create issues (for - instance trying to re-encode with iconv produces an error). - This was obviously an explicit decision in original code (according to - code comments), probably tied to specifity of the original program from - Mozilla. Now we want strict detection. - I will return "ISO-8859-1" for "nearly-ASCII texts with NBSP as only - exception" (note that I could have returned any ISO-8859 charsets since - they all have this character in common). - -diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp -index ab8bae0..ff06b9d 100644 ---- a/src/nsUniversalDetector.cpp -+++ b/src/nsUniversalDetector.cpp -@@ -47,6 +47,7 @@ - - nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter) - { -+ mNbspFound = PR_FALSE; - mDone = PR_FALSE; - mBestGuess = -1; //illegal value as signal - mInTag = PR_FALSE; -@@ -75,6 +76,7 @@ nsUniversalDetector::~nsUniversalDetector() - void - nsUniversalDetector::Reset() - { -+ mNbspFound = PR_FALSE; - mDone = PR_FALSE; - mBestGuess = -1; //illegal value as signal - mInTag = PR_FALSE; -@@ -162,9 +164,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) - PRUint32 i; - for (i = 0; i < aLen; i++) - { -- /* Other than 0xA0, if every other character is ASCII, the page is ASCII. -+ /* If every other character is ASCII or 0xA0, we don't run charset -+ * probers. - * 0xA0 (NBSP in a few charset) is apparently a rare exception -- * of non-ASCII character contained in ASCII text. */ -+ * of non-ASCII character often contained in nearly-ASCII text. */ - if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') - { - /* We got a non-ASCII byte (high-byte) */ -@@ -203,11 +206,19 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) - } - else - { -- //ok, just pure ascii so far -- if ( ePureAscii == mInputState && -- (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) ) -+ /* Just pure ASCII or NBSP so far. */ -+ if (aBuf[i] == '\xA0') - { -- //found escape character or HZ "~{" -+ /* ASCII with the only exception of NBSP seems quite common. -+ * I doubt it is really necessary to train a model here, so let's -+ * just make an exception. -+ */ -+ mNbspFound = PR_TRUE; -+ } -+ else if (mInputState == ePureAscii && -+ (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~'))) -+ { -+ /* We found an escape character or HZ "~{". */ - mInputState = eEscAscii; - } - mLastChar = aBuf[i]; -@@ -229,6 +240,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) - mDone = PR_TRUE; - mDetectedCharset = mEscCharSetProber->GetCharSetName(); - } -+ else if (mNbspFound) -+ { -+ mDetectedCharset = "ISO-8859-1"; -+ } - else - { - /* ASCII with the ESC character (or the sequence "~{") is still -@@ -253,8 +268,17 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) - break; - - default: -- /* Pure ASCII */ -- mDetectedCharset = "ASCII"; -+ if (mNbspFound) -+ { -+ /* ISO-8859-1 is a good result candidate for ASCII + NBSP. -+ * (though it could have been any ISO-8859 encoding). */ -+ mDetectedCharset = "ISO-8859-1"; -+ } -+ else -+ { -+ /* Pure ASCII */ -+ mDetectedCharset = "ASCII"; -+ } - break; - } - return NS_OK; -diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h -index 4d9b460..9f0a4b1 100644 ---- a/src/nsUniversalDetector.h -+++ b/src/nsUniversalDetector.h -@@ -72,6 +72,7 @@ protected: - virtual void Report(const char* aCharset) = 0; - virtual void Reset(); - nsInputState mInputState; -+ PRBool mNbspFound; - PRBool mDone; - PRBool mInTag; - PRBool mStart; |