summaryrefslogtreecommitdiff
path: root/libcharsetdetect/mozilla/extensions/universalchardet/src/base/nsUniversalDetector.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'libcharsetdetect/mozilla/extensions/universalchardet/src/base/nsUniversalDetector.cpp')
-rw-r--r--libcharsetdetect/mozilla/extensions/universalchardet/src/base/nsUniversalDetector.cpp295
1 files changed, 295 insertions, 0 deletions
diff --git a/libcharsetdetect/mozilla/extensions/universalchardet/src/base/nsUniversalDetector.cpp b/libcharsetdetect/mozilla/extensions/universalchardet/src/base/nsUniversalDetector.cpp
new file mode 100644
index 0000000..cd6053e
--- /dev/null
+++ b/libcharsetdetect/mozilla/extensions/universalchardet/src/base/nsUniversalDetector.cpp
@@ -0,0 +1,295 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is Mozilla Universal charset detector code.
+ *
+ * The Initial Developer of the Original Code is
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 2001
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ * Shy Shalom <shooshX@gmail.com>
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#include "nscore.h"
+
+#include "nsUniversalDetector.h"
+
+#include "nsMBCSGroupProber.h"
+#include "nsSBCSGroupProber.h"
+#include "nsEscCharsetProber.h"
+#include "nsLatin1Prober.h"
+
+nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
+{
+ mDone = PR_FALSE;
+ mBestGuess = -1; //illegal value as signal
+ mInTag = PR_FALSE;
+ mEscCharSetProber = nsnull;
+
+ mStart = PR_TRUE;
+ mDetectedCharset = nsnull;
+ mGotData = PR_FALSE;
+ mInputState = ePureAscii;
+ mLastChar = '\0';
+ mLanguageFilter = aLanguageFilter;
+
+ PRUint32 i;
+ for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
+ mCharSetProbers[i] = nsnull;
+}
+
+nsUniversalDetector::~nsUniversalDetector()
+{
+ for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
+ if (mCharSetProbers[i])
+ delete mCharSetProbers[i];
+ if (mEscCharSetProber)
+ delete mEscCharSetProber;
+}
+
+void
+nsUniversalDetector::Reset()
+{
+ mDone = PR_FALSE;
+ mBestGuess = -1; //illegal value as signal
+ mInTag = PR_FALSE;
+
+ mStart = PR_TRUE;
+ mDetectedCharset = nsnull;
+ mGotData = PR_FALSE;
+ mInputState = ePureAscii;
+ mLastChar = '\0';
+
+ if (mEscCharSetProber)
+ mEscCharSetProber->Reset();
+
+ PRUint32 i;
+ for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
+ if (mCharSetProbers[i])
+ mCharSetProbers[i]->Reset();
+}
+
+//---------------------------------------------------------------------
+#define SHORTCUT_THRESHOLD (float)0.95
+#define MINIMUM_THRESHOLD (float)0.20
+
+nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
+{
+ if(mDone)
+ return NS_OK;
+
+ if (aLen > 0)
+ mGotData = PR_TRUE;
+
+ //If the data starts with BOM, we know it is UTF
+ if (mStart)
+ {
+ mStart = PR_FALSE;
+ if (aLen > 3)
+ switch (aBuf[0])
+ {
+ case '\xEF':
+ if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
+ // EF BB BF UTF-8 encoded BOM
+ mDetectedCharset = "UTF-8";
+ break;
+ case '\xFE':
+ if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
+ // FE FF 00 00 UCS-4, unusual octet order BOM (3412)
+ mDetectedCharset = "X-ISO-10646-UCS-4-3412";
+ else if ('\xFF' == aBuf[1])
+ // FE FF UTF-16, big endian BOM
+ mDetectedCharset = "UTF-16BE";
+ break;
+ case '\x00':
+ if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
+ // 00 00 FE FF UTF-32, big-endian BOM
+ mDetectedCharset = "UTF-32BE";
+ else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
+ // 00 00 FF FE UCS-4, unusual octet order BOM (2143)
+ mDetectedCharset = "X-ISO-10646-UCS-4-2143";
+ break;
+ case '\xFF':
+ if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
+ // FF FE 00 00 UTF-32, little-endian BOM
+ mDetectedCharset = "UTF-32LE";
+ else if ('\xFE' == aBuf[1])
+ // FF FE UTF-16, little endian BOM
+ mDetectedCharset = "UTF-16LE";
+ break;
+ } // switch
+
+ if (mDetectedCharset)
+ {
+ mDone = PR_TRUE;
+ return NS_OK;
+ }
+ }
+
+ PRUint32 i;
+ for (i = 0; i < aLen; i++)
+ {
+ //other than 0xa0, if every othe character is ascii, the page is ascii
+ if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP
+ {
+ //we got a non-ascii byte (high-byte)
+ if (mInputState != eHighbyte)
+ {
+ //adjust state
+ mInputState = eHighbyte;
+
+ //kill mEscCharSetProber if it is active
+ if (mEscCharSetProber) {
+ delete mEscCharSetProber;
+ mEscCharSetProber = nsnull;
+ }
+
+ //start multibyte and singlebyte charset prober
+ if (nsnull == mCharSetProbers[0])
+ {
+ mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter);
+ if (nsnull == mCharSetProbers[0])
+ return NS_ERROR_OUT_OF_MEMORY;
+ }
+ if (nsnull == mCharSetProbers[1] &&
+ (mLanguageFilter & NS_FILTER_NON_CJK))
+ {
+ mCharSetProbers[1] = new nsSBCSGroupProber;
+ if (nsnull == mCharSetProbers[1])
+ return NS_ERROR_OUT_OF_MEMORY;
+ }
+ if (nsnull == mCharSetProbers[2])
+ {
+ mCharSetProbers[2] = new nsLatin1Prober;
+ if (nsnull == mCharSetProbers[2])
+ return NS_ERROR_OUT_OF_MEMORY;
+ }
+ }
+ }
+ else
+ {
+ //ok, just pure ascii so far
+ if ( ePureAscii == mInputState &&
+ (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
+ {
+ //found escape character or HZ "~{"
+ mInputState = eEscAscii;
+ }
+ mLastChar = aBuf[i];
+ }
+ }
+
+ nsProbingState st;
+ switch (mInputState)
+ {
+ case eEscAscii:
+ if (nsnull == mEscCharSetProber) {
+ mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter);
+ if (nsnull == mEscCharSetProber)
+ return NS_ERROR_OUT_OF_MEMORY;
+ }
+ st = mEscCharSetProber->HandleData(aBuf, aLen);
+ if (st == eFoundIt)
+ {
+ mDone = PR_TRUE;
+ mDetectedCharset = mEscCharSetProber->GetCharSetName();
+ }
+ break;
+ case eHighbyte:
+ for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
+ {
+ if (mCharSetProbers[i])
+ {
+ st = mCharSetProbers[i]->HandleData(aBuf, aLen);
+ if (st == eFoundIt)
+ {
+ mDone = PR_TRUE;
+ mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
+ return NS_OK;
+ }
+ }
+ }
+ break;
+
+ default: //pure ascii
+ ;//do nothing here
+ }
+ return NS_OK;
+}
+
+
+//---------------------------------------------------------------------
+void nsUniversalDetector::DataEnd()
+{
+ if (!mGotData)
+ {
+ // we haven't got any data yet, return immediately
+ // caller program sometimes call DataEnd before anything has been sent to detector
+ return;
+ }
+
+ if (mDetectedCharset)
+ {
+ mDone = PR_TRUE;
+ Report(mDetectedCharset);
+ return;
+ }
+
+ switch (mInputState)
+ {
+ case eHighbyte:
+ {
+ float proberConfidence;
+ float maxProberConfidence = (float)0.0;
+ PRInt32 maxProber = 0;
+
+ for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
+ {
+ if (mCharSetProbers[i])
+ {
+ proberConfidence = mCharSetProbers[i]->GetConfidence();
+ if (proberConfidence > maxProberConfidence)
+ {
+ maxProberConfidence = proberConfidence;
+ maxProber = i;
+ }
+ }
+ }
+ //do not report anything because we are not confident of it, that's in fact a negative answer
+ if (maxProberConfidence > MINIMUM_THRESHOLD)
+ Report(mCharSetProbers[maxProber]->GetCharSetName());
+ }
+ break;
+ case eEscAscii:
+ break;
+ default:
+ ;
+ }
+ return;
+}