KDECore
JapaneseGroupProber.cpp
Go to the documentation of this file.00001 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 00002 /* -*- C++ -*- 00003 * Copyright (C) 1998 <developer@mozilla.org> 00004 * 00005 * 00006 * Permission is hereby granted, free of charge, to any person obtaining 00007 * a copy of this software and associated documentation files (the 00008 * "Software"), to deal in the Software without restriction, including 00009 * without limitation the rights to use, copy, modify, merge, publish, 00010 * distribute, sublicense, and/or sell copies of the Software, and to 00011 * permit persons to whom the Software is furnished to do so, subject to 00012 * the following conditions: 00013 * 00014 * The above copyright notice and this permission notice shall be included 00015 * in all copies or substantial portions of the Software. 00016 * 00017 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 00018 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 00019 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 00020 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 00021 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 00022 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 00023 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 00024 */ 00025 00026 #include "JapaneseGroupProber.h" 00027 00028 #include "UnicodeGroupProber.h" 00029 #include "nsSJISProber.h" 00030 #include "nsEUCJPProber.h" 00031 00032 #include <stdio.h> 00033 #include <stdlib.h> 00034 00035 namespace kencodingprober { 00036 #ifdef DEBUG_PROBE 00037 static const char* const ProberName[] = 00038 { 00039 "Unicode", 00040 "GB18030", 00041 "Big5", 00042 }; 00043 00044 #endif 00045 00046 JapaneseGroupProber::JapaneseGroupProber() 00047 { 00048 mProbers[0] = new UnicodeGroupProber(); 00049 mProbers[1] = new nsSJISProber(); 00050 mProbers[2] = new nsEUCJPProber(); 00051 Reset(); 00052 } 00053 00054 JapaneseGroupProber::~JapaneseGroupProber() 00055 { 00056 for (unsigned int i = 0; i < JP_NUM_OF_PROBERS; i++) 00057 { 00058 delete mProbers[i]; 00059 } 00060 } 00061 00062 const char* JapaneseGroupProber::GetCharSetName() 00063 { 00064 if (mBestGuess == -1) 00065 { 00066 GetConfidence(); 00067 if (mBestGuess == -1) 00068 mBestGuess = 1; // assume it's GB18030 00069 } 00070 return mProbers[mBestGuess]->GetCharSetName(); 00071 } 00072 00073 void JapaneseGroupProber::Reset(void) 00074 { 00075 mActiveNum = 0; 00076 for (unsigned int i = 0; i < JP_NUM_OF_PROBERS; i++) 00077 { 00078 if (mProbers[i]) 00079 { 00080 mProbers[i]->Reset(); 00081 mIsActive[i] = true; 00082 ++mActiveNum; 00083 } 00084 else 00085 mIsActive[i] = false; 00086 } 00087 mBestGuess = -1; 00088 mState = eDetecting; 00089 } 00090 00091 nsProbingState JapaneseGroupProber::HandleData(const char* aBuf, unsigned int aLen) 00092 { 00093 nsProbingState st; 00094 unsigned int i; 00095 00096 //do filtering to reduce load to probers 00097 char *highbyteBuf; 00098 char *hptr; 00099 bool keepNext = true; //assume previous is not ascii, it will do no harm except add some noise 00100 hptr = highbyteBuf = (char*)malloc(aLen); 00101 if (!hptr) 00102 return mState; 00103 for (i = 0; i < aLen; ++i) 00104 { 00105 if (aBuf[i] & 0x80) 00106 { 00107 *hptr++ = aBuf[i]; 00108 keepNext = true; 00109 } 00110 else 00111 { 00112 //if previous is highbyte, keep this even it is a ASCII 00113 if (keepNext) 00114 { 00115 *hptr++ = aBuf[i]; 00116 keepNext = false; 00117 } 00118 } 00119 } 00120 00121 for (i = 0; i < JP_NUM_OF_PROBERS; ++i) 00122 { 00123 if (!mIsActive[i]) 00124 continue; 00125 st = mProbers[i]->HandleData(highbyteBuf, hptr - highbyteBuf); 00126 if (st == eFoundIt) 00127 { 00128 mBestGuess = i; 00129 mState = eFoundIt; 00130 break; 00131 } 00132 else if (st == eNotMe) 00133 { 00134 mIsActive[i] = false; 00135 mActiveNum--; 00136 if (mActiveNum <= 0) 00137 { 00138 mState = eNotMe; 00139 break; 00140 } 00141 } 00142 } 00143 00144 free(highbyteBuf); 00145 00146 return mState; 00147 } 00148 00149 float JapaneseGroupProber::GetConfidence(void) 00150 { 00151 unsigned int i; 00152 float bestConf = 0.0, cf; 00153 00154 switch (mState) 00155 { 00156 case eFoundIt: 00157 return (float)0.99; 00158 case eNotMe: 00159 return (float)0.01; 00160 default: 00161 for (i = 0; i < JP_NUM_OF_PROBERS; ++i) 00162 { 00163 if (!mIsActive[i]) 00164 continue; 00165 cf = mProbers[i]->GetConfidence(); 00166 if (bestConf < cf) 00167 { 00168 bestConf = cf; 00169 mBestGuess = i; 00170 } 00171 } 00172 } 00173 return bestConf; 00174 } 00175 00176 #ifdef DEBUG_PROBE 00177 void JapaneseGroupProber::DumpStatus() 00178 { 00179 unsigned int i; 00180 float cf; 00181 00182 GetConfidence(); 00183 for (i = 0; i < JP_NUM_OF_PROBERS; i++) 00184 { 00185 if (!mIsActive[i]) 00186 printf(" Chinese group inactive: [%s] (confidence is too low).\r\n", ProberName[i]); 00187 else 00188 { 00189 cf = mProbers[i]->GetConfidence(); 00190 printf(" Chinese group %1.3f: [%s]\r\n", cf, ProberName[i]); 00191 } 00192 } 00193 } 00194 #endif 00195 }