summary refs log tree commit diff
path: root/libcharsetdetect/mozilla/extensions/universalchardet/src/base/JpCntx.h
diff options
context:
space:
mode:
Diffstat (limited to 'libcharsetdetect/mozilla/extensions/universalchardet/src/base/JpCntx.h')
-rw-r--r-- libcharsetdetect/mozilla/extensions/universalchardet/src/base/JpCntx.h | 137
1 files changed, 137 insertions, 0 deletions
diff --git a/libcharsetdetect/mozilla/extensions/universalchardet/src/base/JpCntx.h b/libcharsetdetect/mozilla/extensions/universalchardet/src/base/JpCntx.h
new file mode 100644
index 0000000..1760b75
--- /dev/null
+++ b/libcharsetdetect/mozilla/extensions/universalchardet/src/base/JpCntx.h
@@ -0,0 +1,137 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is Mozilla Communicator client code.
+ *
+ * The Initial Developer of the Original Code is
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 1998
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#ifndef __JPCNTX_H__
+#define __JPCNTX_H__
+
+#define NUM_OF_CATEGORY 6  // number of category counters held in mRelSample
+
+#include "nscore.h"
+
+#define ENOUGH_REL_THRESHOLD 100  // GotEnoughData() is true once mTotalRel exceeds this
+#define MAX_REL_THRESHOLD 1000  // HandleOneChar() sets mDone once mTotalRel exceeds this
+
+// Hiragana frequency category table, indexed as [previous char order][current char order].
+extern char jp2CharContext[83][83];  // each entry is used as an index into mRelSample
+
+class JapaneseContextAnalysis  // abstract base: tallies categories of adjacent 2-byte character pairs
+{
+public:
+ JapaneseContextAnalysis() {Reset();}
+
+ void HandleData(const char* aBuf, PRUint32 aLen);  // declared only; definition is not in this header
+
+ void HandleOneChar(const char* aStr, PRUint32 aCharLen)  // feed one character of aCharLen bytes starting at aStr
+ {
+ PRInt32 order;
+
+ // If we have received enough data, mark detection as done and stop here.
+ if (mTotalRel > MAX_REL_THRESHOLD) mDone = PR_TRUE;
+ if (mDone) return;
+
+ // Only 2-byte characters are of interest; anything else gets order -1.
+ order = (aCharLen == 2) ? GetOrder(aStr) : -1;
+ if (order != -1 && mLastCharOrder != -1)  // both the previous and the current char need a valid order
+ {
+ mTotalRel++;
+ // Count this (previous, current) sequence in its category counter.
+ mRelSample[jp2CharContext[mLastCharOrder][order]]++;
+ }
+ mLastCharOrder = order;  // stored even when -1, so the next char cannot pair with this one
+ }
+
+ float GetConfidence();
+ void Reset(void);
+ void SetOpion(){}  // no-op; NOTE(review): name looks like a misspelling of "SetOption" - kept for callers
+ PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}  // true once more than ENOUGH_REL_THRESHOLD pairs are counted
+
+protected:
+ virtual PRInt32 GetOrder(const char* str, PRUint32 *charLen) = 0;  // pure virtual; presumably reports the char length via charLen - confirm in the implementation
+ virtual PRInt32 GetOrder(const char* str) = 0;  // pure virtual; HandleOneChar treats a result of -1 as "no order"
+
+ // Category counters; each integer counts the sequences seen in its category.
+ PRUint32 mRelSample[NUM_OF_CATEGORY];
+
+ // Total number of sequences received.
+ PRUint32 mTotalRel;
+
+ // The order of the previous char (-1 when it had none).
+ PRInt32 mLastCharOrder;
+
+ // If the last byte in the current buffer is not the last byte of a character,
+ // we need to know how many bytes to skip in the next buffer.
+ PRUint32 mNeedToSkipCharNum;
+
+ // If this flag is set to PR_TRUE, detection is done and a conclusion has been made.
+ PRBool mDone;
+};
+
+
+class SJISContextAnalysis : public JapaneseContextAnalysis  // supplies GetOrder() for SJIS-encoded input
+{
+ //SJISContextAnalysis(){};
+protected:
+ PRInt32 GetOrder(const char* str, PRUint32 *charLen);  // declared only; definition is not in this header
+
+ PRInt32 GetOrder(const char* str)  // returns 0..82 for a Hiragana char at str, otherwise -1
+ {
+ // We are only interested in Hiragana, so the first byte must be '\202' (0x82).
+ if (*str == '\202' &&
+ (unsigned char)*(str+1) >= (unsigned char)0x9f &&
+ (unsigned char)*(str+1) <= (unsigned char)0xf1)
+ return (unsigned char)*(str+1) - (unsigned char)0x9f;  // second byte 0x9f..0xf1 maps to order 0..82
+ return -1;
+ }
+};
+
+class EUCJPContextAnalysis : public JapaneseContextAnalysis  // supplies GetOrder() for EUC-JP-encoded input
+{
+protected:
+ PRInt32 GetOrder(const char* str, PRUint32 *charLen);  // declared only; definition is not in this header
+ PRInt32 GetOrder(const char* str)  // returns 0..82 for a Hiragana char at str, otherwise -1
+ // We are only interested in Hiragana, so the first byte must be '\244' (0xa4).
+ {
+ if (*str == '\244' &&
+ (unsigned char)*(str+1) >= (unsigned char)0xa1 &&
+ (unsigned char)*(str+1) <= (unsigned char)0xf3)
+ return (unsigned char)*(str+1) - (unsigned char)0xa1;  // second byte 0xa1..0xf3 maps to order 0..82
+ return -1;
+ }
+};
+
+#endif /* __JPCNTX_H__ */
+