00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034 #include <CLucene/StdHeader.h>
00035 #include "clucenetokenizer.h"
00036
00037 CL_NS_USE(analysis)
00038 CL_NS_USE(util)
00039
namespace Nepomuk {
    /*
     * Human-readable names for the tokenizer's token types, consumed by
     * CLuceneTokenizer::setToken() via tokenImage[tokenCode].  The array
     * is indexed by the numeric value of the TokenTypes enum, so the
     * order here must match the enum declaration (presumably in
     * clucenetokenizer.h -- NOTE(review): verify against that header
     * before reordering or inserting entries).
     */
    const TCHAR* tokenImageArray[] = {
        _T("<EOF>"),
        _T("<UNKNOWN>"),
        _T("<ALPHANUM>"),
        _T("<APOSTROPHE>"),
        _T("<ACRONYM>"),
        _T("<COMPANY>"),
        _T("<EMAIL>"),
        _T("<HOST>"),
        _T("<NUM>"),
        _T("<CJK>")
    };
    // Exposed as a pointer (rather than the array itself) -- presumably
    // mirroring CLucene's own tokenImage global so existing code links.
    const TCHAR** tokenImage = tokenImageArray;
}
00055
00056
00057
00058
/*
 * Character-classification helpers for the scanning loops below.  These
 * are macros rather than functions because they implicitly operate on
 * locals of the enclosing method: `ch` (an int holding the current
 * character, or -1 at end of input) and, for EOS, the member stream `rd`.
 */
#define EOS (ch==-1 || rd->Eos())
#define SPACE (_istspace((TCHAR)ch) != 0)
#define ALPHA (_istalpha((TCHAR)ch) != 0)
#define ALNUM (_istalnum(ch) != 0)
#define DIGIT (_istdigit(ch) != 0)
#define UNDERSCORE (ch == '_')

/* Rough CJK test by Unicode block: kana/Bopomofo/compatibility Jamo
 * (3040-318f), CJK compatibility (3300-337f), part of CJK extension A
 * (3400-3d2d -- note this stops short of the full block), unified
 * ideographs (4e00-9fff), compatibility ideographs (f900-faff) and
 * Hangul syllables (ac00-d7af). */
#define _CJK ( (ch>=0x3040 && ch<=0x318f) || \
    (ch>=0x3300 && ch<=0x337f) || \
    (ch>=0x3400 && ch<=0x3d2d) || \
    (ch>=0x4e00 && ch<=0x9fff) || \
    (ch>=0xf900 && ch<=0xfaff) || \
    (ch>=0xac00 && ch<=0xd7af) ) //korean

/* Punctuation that may continue a token, with aliases naming the role
 * the character plays inside numbers. */
#define DASH (ch == '-')
#define NEGATIVE_SIGN_ DASH

#define DOT (ch == '.')
#define DECIMAL DOT

/* Reads characters into the local StringBuffer `str` for as long as the
 * given predicate holds for `ch`.  (The parameter name is misleading:
 * the loop consumes while the condition is TRUE and breaks once it
 * fails.)  Stops at end of input, at the first non-matching character,
 * or when `str` reaches LUCENE_MAX_WORD_LEN.  The character that stopped
 * the loop has already been consumed from the stream but is NOT
 * appended; callers unReadChar() it when it matters. */
#define _CONSUME_AS_LONG_AS(conditionFails) while (true) { ch = readChar(); if (ch==-1 || (!(conditionFails) || str.len >= LUCENE_MAX_WORD_LEN)) { break; } str.appendChar(ch);}

#define CONSUME_ALPHAS _CONSUME_AS_LONG_AS(ALPHA)

#define CONSUME_DIGITS _CONSUME_AS_LONG_AS(DIGIT)

#define CONSUME_WORD _CONSUME_AS_LONG_AS(ALNUM)

#define CONSUME_CJK _CONSUME_AS_LONG_AS(_CJK)

/* True when a consume loop made no progress past the special character
 * recorded in the caller's local `specialCharPos`: either nothing was
 * read, or only a single non-token character was. */
#define CONSUMED_NOTHING_OF_VALUE (rdPos == specialCharPos || (rdPos == specialCharPos+1 && ( SPACE || !(ALNUM || DOT || DASH || UNDERSCORE) )))

/* Last character of a StringBuffer; undefined if the buffer is empty. */
#define RIGHTMOST(sb) (sb.getBuffer()[sb.len-1])
#define RIGHTMOST_IS(sb, c) (RIGHTMOST(sb) == c)

/* Removes the last character in place by shortening the buffer and
 * re-terminating it. */
#define SHAVE_RIGHTMOST(sb) (sb.getBuffer()[--sb.len] = '\0')

/* True if sb contains at least one character from the literal `ofThese`
 * (the span of characters NOT in the set is shorter than the buffer). */
#define CONTAINS_ANY(sb, ofThese) (_tcscspn(sb.getBuffer(), _T(ofThese)) != static_cast<size_t>(sb.len))
00119
00120
00121 namespace Nepomuk {
00122
00123 CLuceneTokenizer::CLuceneTokenizer(Reader* reader):
00124 rd(_CLNEW FastCharStream(reader)),
00125
00126
00127 rdPos(-1),
00128 tokenStart(-1)
00129 {
00130 }
00131
CLuceneTokenizer::~CLuceneTokenizer() {
    // Release the stream wrapper allocated in the constructor.
    // NOTE(review): whether the underlying Reader is owned (and deleted)
    // by FastCharStream or by the caller is not visible here -- confirm
    // against CLucene's FastCharStream semantics.
    _CLDELETE(rd);
}
00135
00136 int CLuceneTokenizer::readChar() {
00137
00138
00139 rdPos++;
00140 return rd->GetNext();
00141 }
00142
00143 void CLuceneTokenizer::unReadChar() {
00144 rd->UnGet();
00145 rdPos--;
00146 }
00147
/*
 * Finalizes the token accumulated in sb (which wraps t's own _termText
 * buffer, so no copy is needed): records the start/end offsets from
 * tokenStart and the buffer length, and sets the type string looked up
 * in tokenImage.  Always returns true so callers can use it as a tail
 * call from their own bool-returning scanners.
 */
inline bool CLuceneTokenizer::setToken(CL_NS( analysis )::Token* t, StringBuffer* sb, TokenTypes tokenCode) {
    t->setStartOffset(tokenStart);
    t->setEndOffset(tokenStart+sb->length());
    t->setType(tokenImage[tokenCode]);
    // The return value is deliberately discarded: getBuffer() is called
    // for its side effect -- presumably null-terminating _termText.
    // NOTE(review): confirm against CLucene's StringBuffer implementation.
    sb->getBuffer();
    t->resetTermTextLen();
    return true;
}
00156
00157 bool CLuceneTokenizer::next(Token* t) {
00158 int ch=0;
00159 while (!EOS) {
00160 ch = readChar();
00161
00162 if ( ch == 0 || ch == -1 ){
00163 continue;
00164 } else if (SPACE || UNDERSCORE) {
00165 continue;
00166 } else if (ALPHA) {
00167 tokenStart = rdPos;
00168 return ReadAlphaNum(ch,t);
00169 } else if (DIGIT || NEGATIVE_SIGN_ || DECIMAL) {
00170 tokenStart = rdPos;
00171
00172
00173 if (ReadNumber(NULL, ch,t))
00174 return true;
00175 } else if ( _CJK ){
00176 if ( ReadCJK(ch,t) )
00177 return true;
00178 }
00179 }
00180 return false;
00181 }
00182
/*
 * Scans a number token whose first character `prev` ('-', '.' or a
 * digit) has already been consumed.
 *
 * previousNumber  non-NULL only when recursing: the digits-and-dots
 *                 prefix collected so far (e.g. while reading
 *                 "12.34.56"), which switches the token type from NUM to
 *                 HOST (dotted-quad style).
 *
 * Returns false when the characters do not form a valid number (e.g. a
 * bare '-' or '.'); the caller's scan loop then simply continues.
 */
bool CLuceneTokenizer::ReadNumber(const TCHAR* previousNumber, const TCHAR prev,Token* t) {
    // Accumulate directly into the token's own term buffer.
    t->growBuffer(LUCENE_MAX_WORD_LEN+1);
    StringBuffer str(t->_termText,t->bufferLength(),true);
    TokenTypes tokenType;
    bool decExhausted;  // token began with '.', i.e. it has no integer part
    if (previousNumber != NULL) {
        str.prepend(previousNumber);
        tokenType = HOST;
        decExhausted = false;
    } else {
        tokenType = NUM;
        decExhausted = (prev == '.');
    }
    if ( str.len >= LUCENE_MAX_WORD_LEN ){
        // No room for even one more character: give up on this token.
        return false;
    }
    str.appendChar(prev);

    const bool signExhausted = (prev == '-');
    int ch = prev;

    CONSUME_DIGITS;

    // A bare sign, or a leading '.' with no digits after it, is not a
    // number.
    if (str.len < 2
        && (
            (signExhausted && !DECIMAL)
            || (decExhausted )
        )
    )
    {
        if (!EOS) {
            // Put back the character that stopped CONSUME_DIGITS.
            unReadChar();
        }
        return false;
    }

    // Optional fractional part / further dotted groups.
    if (!EOS) {
        if (DECIMAL) {
            if ( str.len >= LUCENE_MAX_WORD_LEN )
                return false;
            str.appendChar(ch);
        } else {
            unReadChar();
            goto SUCCESSFULLY_EXTRACTED_NUMBER;
        }

        CONSUME_DIGITS;
        if (!DIGIT && !DECIMAL) {
            unReadChar();
        } else if (!EOS && DECIMAL && _istdigit(rd->Peek())) {
            // Another ".digit" follows ("1.2.3..."): recurse with the text
            // collected so far, which re-types the token as HOST.
            return ReadNumber(str.getBuffer(), '.',t);
        }
    }

SUCCESSFULLY_EXTRACTED_NUMBER:
    TCHAR rightmost = RIGHTMOST(str);

    // Do not include a trailing decimal point in the token.
    if (rightmost == '.') {
        SHAVE_RIGHTMOST(str);
        unReadChar();
        rightmost = RIGHTMOST(str);
    }

    // Only a '-' survived: not a number after all.
    if (rightmost == '-') {
        CND_PRECONDITION (str.len == 1, "Number is invalid");
        return false;
    }

    return setToken(t,&str,tokenType);
}
00271
/*
 * Scans an alphanumeric token whose first letter `prev` has already been
 * consumed, then dispatches on the character that terminated the word:
 * an apostrophe, '@' or '&' hands off to the corresponding specialized
 * reader.  Always produces a token.
 */
bool CLuceneTokenizer::ReadAlphaNum(const TCHAR prev, Token* t) {
    // Accumulate directly into the token's own term buffer.
    t->growBuffer(LUCENE_MAX_WORD_LEN+1);
    StringBuffer str(t->_termText,t->bufferLength(),true);
    if ( str.len < LUCENE_MAX_WORD_LEN ){
        str.appendChar(prev);
        int ch = prev;

        CONSUME_WORD;
        if (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) {
            switch(ch) {
#if 0
            // Dotted/host detection from plain words is disabled in this
            // variant; a '.' simply terminates the token instead.
            case '.':
                str.appendChar('.');
                return ReadDotted(&str, UNKNOWN,t);
#endif
            case '\'':
                // "o'clock", "O'Reilly", ...
                str.appendChar('\'');
                return ReadApostrophe(&str,t);
            case '@':
                // e-mail address (or company-style "name@name")
                str.appendChar('@');
                return ReadAt(&str,t);
            case '&':
                // company name such as "AT&T"
                str.appendChar('&');
                return ReadCompany(&str,t);

            }
        }
        // NOTE(review): any other terminating character stays consumed and
        // is never pushed back, so the caller's loop skips it -- harmless
        // for whitespace/punctuation, but a CJK character immediately
        // following a word would be lost.  Confirm this is intended.
    }
    return setToken(t,&str,ALPHANUM);
}
00302
00303 bool CLuceneTokenizer::ReadCJK(const TCHAR prev, Token* t) {
00304 t->growBuffer(LUCENE_MAX_WORD_LEN+1);
00305 StringBuffer str(t->_termText,t->bufferLength(),true);
00306 if ( str.len < LUCENE_MAX_WORD_LEN ){
00307 str.appendChar(prev);
00308 int ch = prev;
00309
00310 CONSUME_CJK;
00311 }
00312 return setToken(t,&str,CJK);
00313 }
00314
00315
00316 bool CLuceneTokenizer::ReadDotted(StringBuffer* _str, TokenTypes forcedType, CL_NS(analysis)::Token* t) {
00317 const int32_t specialCharPos = rdPos;
00318 StringBuffer& str=*_str;
00319
00320
00321
00322
00323
00324 int ch = rd->Peek();
00325 if (!(DOT || DASH)) {
00326 bool prevWasDot;
00327 bool prevWasDash;
00328 if (str.len == 0) {
00329 prevWasDot = false;
00330 prevWasDash = false;
00331 } else {
00332 prevWasDot = RIGHTMOST(str) == '.';
00333 prevWasDash = RIGHTMOST(str) == '-';
00334 }
00335 while (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) {
00336 ch = readChar();
00337 const bool dot = ch == '.';
00338 const bool dash = ch == '-';
00339
00340 if (!(ALNUM || UNDERSCORE || dot || dash)) {
00341 break;
00342 }
00343
00344
00345
00346
00347 if ((dot || dash) && (prevWasDot || prevWasDash)) {
00348
00349
00350
00351
00352 if (!prevWasDot) {
00353 SHAVE_RIGHTMOST(str);
00354 }
00355 break;
00356 }
00357
00358 str.appendChar(ch);
00359
00360 prevWasDot = dot;
00361 prevWasDash = dash;
00362 }
00363 }
00364
00365
00366
00367
00368
00369
00370
00371
00372 {
00373 TCHAR* strBuf = str.getBuffer();
00374
00375 bool rightmostIsDot = RIGHTMOST_IS(str, '.');
00376 if (CONSUMED_NOTHING_OF_VALUE) {
00377
00378 if (rightmostIsDot) {
00379 SHAVE_RIGHTMOST(str);
00380 }
00381
00382 if (_tcschr(strBuf, '.') == NULL) {
00383 forcedType = ALPHANUM;
00384 }
00385
00386
00387
00388
00389 } else if (rightmostIsDot) {
00390 bool isAcronym = true;
00391 const int32_t upperCheckLimit = str.len - 1;
00392
00393 for (int32_t i = 0; i < upperCheckLimit; i++) {
00394 const bool even = (i % 2 == 0);
00395 ch = strBuf[i];
00396 if ( (even && !ALPHA) || (!even && !DOT) ) {
00397 isAcronym = false;
00398 break;
00399 }
00400 }
00401 if (isAcronym) {
00402 forcedType = ACRONYM;
00403 } else {
00404
00405 SHAVE_RIGHTMOST(str);
00406
00407 if (_tcschr(strBuf, '.') == NULL) {
00408 forcedType = ALPHANUM;
00409 }
00410 }
00411 }
00412 }
00413
00414 if (!EOS) {
00415 if (ch == '@' && str.len < LUCENE_MAX_WORD_LEN-1) {
00416 str.appendChar('@');
00417 return ReadAt(&str,t);
00418 } else {
00419 unReadChar();
00420 }
00421 }
00422
00423 return setToken(t,&str,UNKNOWN
00424 ? forcedType : HOST);
00425 }
00426
/*
 * Continues a token after an apostrophe has been consumed ("O'" so far),
 * reading the letters that follow.  If nothing of value follows, the
 * apostrophe was trailing punctuation: it is stripped again and the
 * token downgraded to plain ALPHANUM.  Always produces a token.
 */
bool CLuceneTokenizer::ReadApostrophe(StringBuffer* _str, Token* t) {
    StringBuffer& str=*_str;

    TokenTypes tokenType = APOSTROPHE;
    const int32_t specialCharPos = rdPos;
    int ch=0;

    CONSUME_ALPHAS;
    // CONSUME_ALPHAS only appends letters, so if the apostrophe is still
    // rightmost (or nothing of value was consumed) we shave it off.
    if (RIGHTMOST_IS(str, '\'') || CONSUMED_NOTHING_OF_VALUE) {
        SHAVE_RIGHTMOST(str);
        tokenType = ALPHANUM;
    }
    if (!EOS) {
        // Put back the character that ended the letter run.
        unReadChar();
    }

    return setToken(t,&str,tokenType);
}
00447
/*
 * Continues a token after an '@' has been consumed ("name@" so far).
 * Delegates to ReadDotted() to scan the host part as an EMAIL token; its
 * return value is deliberately ignored because a token is produced
 * either way.
 */
bool CLuceneTokenizer::ReadAt(StringBuffer* str, Token* t) {
    ReadDotted(str, EMAIL,t);
    // If the whole token contains neither a dot nor a digit (e.g.
    // "excite@home"), re-tag it as a COMPANY name instead of an e-mail
    // address -- setToken overwrites the type set by ReadDotted.
    if (!CONTAINS_ANY((*str), ".0123456789")) {
        setToken(t,str,COMPANY);
    }
    return true;
}
00456
/*
 * Continues a token after an '&' has been consumed ("AT&" so far),
 * producing a COMPANY token such as "AT&T".  If nothing of value follows
 * the ampersand, it was trailing punctuation: it is stripped again and
 * the leading part emitted as a plain ALPHANUM.  Always produces a token.
 */
bool CLuceneTokenizer::ReadCompany(StringBuffer* _str, Token* t) {
    StringBuffer& str = *_str;
    const int32_t specialCharPos = rdPos;
    int ch=0;

    CONSUME_WORD;
    if (CONSUMED_NOTHING_OF_VALUE) {
        // CONSUME_WORD appended nothing, so the '&' must still be the
        // last character in the buffer; drop it.
        CND_PRECONDITION(RIGHTMOST_IS(str, '&'),"ReadCompany failed");
        SHAVE_RIGHTMOST(str);

        return setToken(t,&str,ALPHANUM);
    }
    if (!EOS) {
        // Put back the character that ended the word.
        unReadChar();
    }

    return setToken(t,&str,COMPANY);
}
00478 }