• Skip to content
  • Skip to link menu
KDE 4.1 API Reference
  • KDE API Reference
  • API Reference
  • Sitemap
  • Contact Us
 

NepomukDaemons

clucenetokenizer.cpp

Go to the documentation of this file.
00001 /*
00002  * Modified version of StandardTokenizer.cpp for Nepomuk mostly to optimize for filename indexing
00003  * Copyright (C) 2008 Sebastian Trueg <trueg@kde.org>
00004  *
00005  * Based on StandardTokenizer.cpp from the CLucene package.
00006  * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
00007  *
00008  * This library is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU Library General Public
00010  * License as published by the Free Software Foundation; either
00011  * version 2 of the License, or (at your option) any later version.
00012  *
00013  * This library is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  * Library General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU Library General Public License
00019  * along with this library; see the file COPYING.LIB.  If not, write to
00020  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00021  * Boston, MA 02110-1301, USA.
00022  */
00023 
00024 /*
00025  * Modified version for Nepomuk mostly to optimize for filename indexing
00026  * Changes:
00027  * - underscore is treated as space, i.e. tokens are always divided at underscores
00028  * - try to always divide tokens at dots.
00029  *   FIXME: we should still allow the acronym detection
00030  *   FIXME: is it possible to extract both? for example email addresses. Here it would
00031  *          be nice to have both the full address as a token and the separated portions.
00032  */
00033 
00034 #include <CLucene/StdHeader.h>
00035 #include "clucenetokenizer.h"
00036 
00037 CL_NS_USE(analysis)
00038 CL_NS_USE(util)
00039 
00040 namespace Nepomuk {
          /* Printable names for the token types.  setToken() picks a name with
          ** tokenImage[tokenCode], so the entries are indexed by TokenTypes
          ** value -- presumably in the same order as the enum declared in
          ** clucenetokenizer.h (TODO confirm against the header). */
00041     const TCHAR* tokenImageArray[] = {
00042         _T("<EOF>"),
00043         _T("<UNKNOWN>"),
00044         _T("<ALPHANUM>"),
00045         _T("<APOSTROPHE>"),
00046         _T("<ACRONYM>"),
00047         _T("<COMPANY>"),
00048         _T("<EMAIL>"),
00049         _T("<HOST>"),
00050         _T("<NUM>"),
00051         _T("<CJK>")
00052     };
          /* Pointer alias through which the table above is accessed. */
00053     const TCHAR** tokenImage = tokenImageArray;
00054 }
00055 
00056 
00057 /* A bunch of shortcut macros, many of which make assumptions about variable
00058 ** names.  These macros enhance readability, not just convenience!
      ** Every character test below examines a variable named `ch` that must be
      ** in scope where the macro is used; EOS additionally uses the member `rd`. */
      /* True once the last read returned -1 or the stream reports end of input. */
00059 #define EOS           (ch==-1 || rd->Eos())
      /* Character classes of `ch`.
      ** NOTE(review): SPACE and ALPHA cast ch to TCHAR first, ALNUM and DIGIT
      ** pass it uncast -- confirm the difference is intentional. */
00060 #define SPACE         (_istspace((TCHAR)ch) != 0)
00061 #define ALPHA         (_istalpha((TCHAR)ch) != 0)
00062 #define ALNUM         (_istalnum(ch) != 0)
00063 #define DIGIT         (_istdigit(ch) != 0)
00064 #define UNDERSCORE    (ch == '_')
00065 
      /* True when `ch` lies in one of the code-point ranges handled as CJK. */
00066 #define _CJK            (  (ch>=0x3040 && ch<=0x318f) ||            \
00067                            (ch>=0x3300 && ch<=0x337f) ||            \
00068                            (ch>=0x3400 && ch<=0x3d2d) ||            \
00069                            (ch>=0x4e00 && ch<=0x9fff) ||            \
00070                            (ch>=0xf900 && ch<=0xfaff) ||            \
00071                            (ch>=0xac00 && ch<=0xd7af) ) //korean
00072 
00073 
      /* A dash doubles as the negative sign of a number; no positive sign is
      ** recognised (the '+' variants below are commented out). */
00074 #define DASH          (ch == '-')
00075 #define NEGATIVE_SIGN_ DASH
00076 //#define POSITIVE_SIGN_ (ch == '+')
00077 //#define SIGN          (NEGATIVE_SIGN_ || POSITIVE_SIGN_)
00078 
      /* A dot doubles as the decimal separator of a number. */
00079 #define DOT             (ch == '.')
00080 #define DECIMAL         DOT
00081 
00082 
00083 //freebsd seems to have a problem with defines over multiple lines, so this has to be one long line
      /* Reads characters into `ch` and appends them to `str` for as long as the
      ** given condition HOLDS (despite the parameter name `conditionFails`).
      ** The loop stops when readChar() returns -1, when the condition is false
      ** for the character just read, or when str.len has reached
      ** LUCENE_MAX_WORD_LEN.  The character that stopped the loop stays in `ch`,
      ** is not appended, and has already been consumed from the reader -- the
      ** caller must unread it if it should be seen again.
      ** Requires `ch`, `str` and readChar() to be in scope. */
00084 #define _CONSUME_AS_LONG_AS(conditionFails) while (true) { ch = readChar(); if (ch==-1 || (!(conditionFails) || str.len >= LUCENE_MAX_WORD_LEN)) { break; } str.appendChar(ch);}
00085 
      /* Consume a run of alphabetic characters. */
00086 #define CONSUME_ALPHAS _CONSUME_AS_LONG_AS(ALPHA)
00087 
      /* Consume a run of digits. */
00088 #define CONSUME_DIGITS _CONSUME_AS_LONG_AS(DIGIT)
00089 
00090 /* Consume a run of alphanumeric characters.  The macro takes no arguments:
00091 ** only ALNUM characters extend the span.  (An earlier version of this comment
00092 ** described an `otherMatches` parameter that this definition does not have.) */
00093 #define CONSUME_WORD                  _CONSUME_AS_LONG_AS(ALNUM)
00094 
00095 /*
00096 ** Consume CJK characters
00097 */
00098 #define CONSUME_CJK                   _CONSUME_AS_LONG_AS(_CJK)
00099 
00100 
00101 /* It is considered that "nothing of value" has been read if:
00102 ** a) The "read head" hasn't moved since specialCharPos was established.
00103 ** or
00104 ** b) The "read head" has moved by one character, but that character was
00105 **    either whitespace or not among the characters found in the body of
00106 **    a token (deliberately doesn't include the likes of '@'/'&').
      ** Requires `rdPos`, a local `specialCharPos` and `ch` to be in scope. */
00107 #define CONSUMED_NOTHING_OF_VALUE (rdPos == specialCharPos || (rdPos == specialCharPos+1 && ( SPACE || !(ALNUM || DOT || DASH || UNDERSCORE) )))
00108 
      /* Last character of StringBuffer sb.  Indexes len-1, so sb must not be
      ** empty when these are used. */
00109 #define RIGHTMOST(sb) (sb.getBuffer()[sb.len-1])
00110 #define RIGHTMOST_IS(sb, c) (RIGHTMOST(sb) == c)
00111 /* To discard the last character in a StringBuffer, we decrement the buffer's
00112 ** length indicator and move the terminator back by one character. */
00113 #define SHAVE_RIGHTMOST(sb) (sb.getBuffer()[--sb.len] = '\0')
00114 
00115 //#define REMOVE_TRAILING_CHARS(sb, charMatchesCondition) { TCHAR* sbBuf = sb.getBuffer(); for (int32_t i = sb.len-1; i >= 0; i--) { TCHAR c = sbBuf[i]; if (charMatchesCondition) { sbBuf[--sb.len] = '\0'; } else {break;}}}
00116 
00117 /* Does StringBuffer sb contain any of the characters in string ofThese?
      ** ofThese must be a string literal, because it is wrapped in _T().  The
      ** test compares the length of the leading run free of those characters
      ** with sb.len; they differ when one of the characters was found. */
00118 #define CONTAINS_ANY(sb, ofThese) (_tcscspn(sb.getBuffer(), _T(ofThese)) != static_cast<size_t>(sb.len))
00119 
00120 
00121 namespace Nepomuk {
00122 
00123     CLuceneTokenizer::CLuceneTokenizer(Reader* reader):
              /* Wrap the reader in a newly allocated FastCharStream; the
              ** destructor releases it again with _CLDELETE. */
00124         rd(_CLNEW FastCharStream(reader)),
00125         /* rdPos is zero-based.  It starts at -1, and will advance to the first
00126         ** position when readChar() is first called. */
00127         rdPos(-1),
              /* tokenStart holds no meaningful position until next() assigns it. */
00128         tokenStart(-1)
00129     {
00130     }
00131 
00132     CLuceneTokenizer::~CLuceneTokenizer() {
              /* Release the FastCharStream allocated in the constructor. */
00133         _CLDELETE(rd);
00134     }
00135 
00136     int CLuceneTokenizer::readChar() {
              /* Advances the read position and returns the next character from the
              ** stream.  Callers in this file treat a return value of -1 as end of
              ** input. */
00137         /* Increment by 1 because we're speaking in terms of characters, not
00138         ** necessarily bytes: */
00139         rdPos++;
00140         return rd->GetNext();
00141     }
00142 
00143     void CLuceneTokenizer::unReadChar() {
              /* Undo one readChar(): push the last character back onto the stream
              ** and step the read position back accordingly. */
00144         rd->UnGet();
00145         rdPos--;
00146     }
00147 
00148     inline bool CLuceneTokenizer::setToken(CL_NS( analysis )::Token* t, StringBuffer* sb, TokenTypes tokenCode) {
              /* Finalizes token t: the offsets are derived from tokenStart and the
              ** length of sb, and the type string is looked up in tokenImage.  The
              ** term text is not copied here; the Read* functions in this file build
              ** sb directly over t->_termText.  Always returns true. */
00149         t->setStartOffset(tokenStart);
00150         t->setEndOffset(tokenStart+sb->length());
00151         t->setType(tokenImage[tokenCode]);
00152         sb->getBuffer(); //null terminates the buffer
              /* presumably makes the token re-derive its text length from the
              ** terminated buffer -- TODO confirm against Token */
00153         t->resetTermTextLen();
00154         return true;
00155     }
00156 
00157     bool CLuceneTokenizer::next(Token* t) {
              /* Reads the next token from the stream into t.
              ** Skips NUL characters, whitespace and underscores, then dispatches on
              ** the first significant character: letters start an alphanumeric token,
              ** digits / '-' / '.' start a number, CJK code points start a CJK token.
              ** Any other character is skipped.  Returns true when t was filled and
              ** false once the input is exhausted. */
00158         int ch=0;
00159         while (!EOS) {
00160             ch = readChar();
00161 
00162             if ( ch == 0 || ch == -1 ){
00163                 continue;
00164             } else if (SPACE || UNDERSCORE) {
00165                 continue;
00166             } else if (ALPHA) {
00167                 tokenStart = rdPos;
00168                 return ReadAlphaNum(ch,t);
00169             } else if (DIGIT || NEGATIVE_SIGN_ || DECIMAL) {
00170                 tokenStart = rdPos;
00171                 /* ReadNumber returns false if it fails to extract a valid number; in
00172                 ** that case, we just continue. */
00173                 if (ReadNumber(NULL, ch,t))
00174                     return true;
00175             } else if ( _CJK ){
                      /* Record where the token begins, as the other branches do;
                      ** without this setToken() would report offsets based on a
                      ** stale tokenStart (the previous token's, or -1). */
                      tokenStart = rdPos;
00176                 if ( ReadCJK(ch,t) )
00177                     return true;
00178             }
00179         }
00180         return false;
00181     }
00182 
00183     bool CLuceneTokenizer::ReadNumber(const TCHAR* previousNumber, const TCHAR prev,Token* t) {
              /* Tries to read a numeric token whose first character, prev (a digit,
              ** '-' or '.'), has already been consumed.  The token is typed NUM, or
              ** HOST when the function recursed over several dotted digit groups.
              ** Returns false when the text is not a valid number or is too long;
              ** otherwise returns the result of setToken(). */
00184         /* previousNumber is only non-NULL if this function already read a complete
00185         ** number in a previous recursion, yet has been asked to read additional
00186         ** numeric segments.  For example, in the HOST "192.168.1.3", "192.168" is
00187         ** a complete number, but this function will recurse to read the "1.3",
00188         ** generating a single HOST token "192.168.1.3". */
00189         t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word
00190         StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
00191         TokenTypes tokenType;
              /* decExhausted: the leading character was already the decimal point. */
00192         bool decExhausted;
00193         if (previousNumber != NULL) {
00194             str.prepend(previousNumber);
00195             tokenType = HOST;
00196             decExhausted = false;
00197         } else {
00198             tokenType = NUM;
00199             decExhausted = (prev == '.');
00200         }
00201         if (  str.len >= LUCENE_MAX_WORD_LEN ){
00202             //if a number is too long, i would say there is no point
00203             //storing it, because it's going to be the wrong number anyway?
00204             //what do people think?
00205             return false;
00206         }
00207         str.appendChar(prev);
00208 
              /* signExhausted: the leading character was the negative sign. */
00209         const bool signExhausted = (prev == '-');
              /* The CONSUME_* and character-class macros operate on `ch` and `str`. */
00210         int ch = prev;
00211 
00212         CONSUME_DIGITS;
00213 
00214         if (str.len < 2 /* CONSUME_DIGITS didn't find any digits. */
00215             && (
00216                 (signExhausted && !DECIMAL)
00217                 || (decExhausted /* && !DIGIT is implied, since CONSUME_DIGITS stopped on a non-digit. */)
00218                 )
00219             )
00220         {
00221             /* We have either:
00222             **   a) a negative sign that's not followed by either digit(s) or a decimal
00223             **   b) a decimal that's not followed by digit(s)
00224             ** so this is not a valid number. */
00225             if (!EOS) {
00226                 /* Unread the character that stopped CONSUME_DIGITS: */
00227                 unReadChar();
00228             }
00229             return false;
00230         }
00231 
00232         /* We just read a group of digits.  Is it followed by a decimal symbol,
00233         ** implying that there might be another group of digits available? */
00234         if (!EOS) {
00235             if (DECIMAL) {
00236                 if (  str.len >= LUCENE_MAX_WORD_LEN )
00237                     return false; //read above for rationale
00238                 str.appendChar(ch);
00239             } else {
00240                 unReadChar();
00241                 goto SUCCESSFULLY_EXTRACTED_NUMBER;
00242             }
00243 
00244             CONSUME_DIGITS;
                  /* NOTE(review): unlike the failure path above, this unReadChar()
                  ** is not guarded by !EOS -- confirm that un-getting after the
                  ** stream ended here is harmless. */
00245             if (!DIGIT && !DECIMAL) {
00246                 unReadChar();
00247             } else if (!EOS && DECIMAL && _istdigit(rd->Peek())) {
00248                 /* We just read the fractional digit group, but it's also followed by
00249                 ** a decimal symbol and at least one more digit, so this must be a
00250                 ** HOST rather than a real number. */
                      /* NOTE(review): the recursive call builds a new StringBuffer over
                      ** t->_termText and prepends this pointer to it; presumably both
                      ** refer to the same storage -- confirm StringBuffer copes. */
00251                 return ReadNumber(str.getBuffer(), '.',t);
00252             }
00253         }
00254 
00255     SUCCESSFULLY_EXTRACTED_NUMBER:
00256         TCHAR rightmost = RIGHTMOST(str);
00257         /* Don't include a trailing decimal point. */
00258         if (rightmost == '.') {
00259             SHAVE_RIGHTMOST(str);
00260             unReadChar();
00261             rightmost = RIGHTMOST(str);
00262         }
00263         /* If all we have left is a negative sign, it's not a valid number. */
00264         if (rightmost == '-') {
00265             CND_PRECONDITION (str.len == 1, "Number is invalid");
00266             return false;
00267         }
00268 
00269         return setToken(t,&str,tokenType);
00270     }
00271 
00272     bool CLuceneTokenizer::ReadAlphaNum(const TCHAR prev, Token* t) {
              /* Reads an alphanumeric token whose first character, prev, has already
              ** been consumed.  If the run of alphanumerics is followed by an
              ** apostrophe, '@' or '&', that character is appended and the matching
              ** specialised reader finishes the token; otherwise the token is a plain
              ** ALPHANUM.  Any other terminating character has been consumed by
              ** CONSUME_WORD and is not unread here. */
00273         t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word
00274         StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
00275         if (  str.len < LUCENE_MAX_WORD_LEN ){
00276             str.appendChar(prev);
                  /* CONSUME_WORD and EOS operate on `ch` and `str`. */
00277             int ch = prev;
00278 
00279             CONSUME_WORD;
00280             if (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) { //still have space for 1 more character?
00281                 switch(ch) { /* What follows the first alphanum segment? */
                      /* The dotted-token case is compiled out, so a '.' ends the token
                      ** (tokens are divided at dots, as the file header states). */
00282 #if 0
00283                 case '.':
00284                     str.appendChar('.');
00285                     return ReadDotted(&str, UNKNOWN,t);
00286 #endif
00287                 case '\'':
00288                     str.appendChar('\'');
00289                     return ReadApostrophe(&str,t);
00290                 case '@':
00291                     str.appendChar('@');
00292                     return ReadAt(&str,t);
00293                 case '&':
00294                     str.appendChar('&');
00295                     return ReadCompany(&str,t);
00296                     /* default: fall through to end of this function. */
00297                 }
00298             }
00299         }
00300         return setToken(t,&str,ALPHANUM);
00301     }
00302 
00303     bool CLuceneTokenizer::ReadCJK(const TCHAR prev, Token* t) {
              /* Reads a run of CJK characters, starting with the already consumed
              ** prev, into t and marks the token as CJK.  Returns the result of
              ** setToken().
              ** NOTE(review): the character that ends CONSUME_CJK has been read from
              ** the stream and is not unread here -- confirm that dropping it is
              ** intended. */
00304         t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word
00305         StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
00306         if ( str.len < LUCENE_MAX_WORD_LEN ){
00307             str.appendChar(prev);
                  /* CONSUME_CJK operates on `ch` and `str`. */
00308             int ch = prev;
00309 
00310             CONSUME_CJK;
00311         }
00312         return setToken(t,&str,CJK);
00313     }
00314 
00315 
00316     bool CLuceneTokenizer::ReadDotted(StringBuffer* _str, TokenTypes forcedType, CL_NS(analysis)::Token* t) {
              /* Continues a token that may contain dots and dashes (host names,
              ** acronyms, the domain part of an e-mail address).  _str holds the text
              ** read so far and is extended in place.  forcedType is the type the
              ** caller wants; it is replaced by ALPHANUM when no dot remains and by
              ** ACRONYM when the text has the letter-dot-letter-dot shape.  If
              ** forcedType is still UNKNOWN at the end, the token is typed HOST.
              ** Returns the result of setToken(), or of ReadAt() when an '@' follows. */
              /* Remember where this segment started; CONSUMED_NOTHING_OF_VALUE
              ** compares rdPos against it. */
00317         const int32_t specialCharPos = rdPos;
00318         StringBuffer& str=*_str;
00319 
00320         /* A segment of a "dotted" is not allowed to begin with another dot or a dash.
00321         ** Even though hosts, e-mail addresses, etc., could have a dotted-segment
00322         ** that begins with a dot or a dash, it's far more common in source text
00323         ** for a pattern like "abc.--def" to be intended as two tokens. */
00324         int ch = rd->Peek();
00325         if (!(DOT || DASH)) {
00326             bool prevWasDot;
00327             bool prevWasDash;
00328             if (str.len == 0) {
00329                 prevWasDot = false;
00330                 prevWasDash = false;
00331             } else {
00332                 prevWasDot = RIGHTMOST(str) == '.';
00333                 prevWasDash = RIGHTMOST(str) == '-';
00334             }
00335             while (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) {
00336                 ch = readChar();
00337                 const bool dot = ch == '.';
00338                 const bool dash = ch == '-';
00339 
00340                 if (!(ALNUM || UNDERSCORE || dot || dash)) {
00341                     break;
00342                 }
00343                 /* Multiple dots or dashes in succession end the token.
00344                 ** Consider the following inputs:
00345                 **   "Visit windowsupdate.microsoft.com--update today!"
00346                 **   "In the U.S.A.--yes, even there!"                 */
00347                 if ((dot || dash) && (prevWasDot || prevWasDash)) {
00348                     /* We're not going to append the character we just read, in any case.
00349                     ** As to the character before it (which is currently RIGHTMOST(str)):
00350                     ** Unless RIGHTMOST(str) is a dot, in which case we need to save it so
00351                     ** the acronym-versus-host detection can work, we want to get rid of it. */
00352                     if (!prevWasDot) {
00353                         SHAVE_RIGHTMOST(str);
00354                     }
00355                     break;
00356                 }
00357 
00358                 str.appendChar(ch);
00359 
00360                 prevWasDot = dot;
00361                 prevWasDash = dash;
00362             }
00363         }
00364 
00365         /* There's a potential StringBuffer.append call in the code above, which
00366         ** could cause str to reallocate its internal buffer.  We must wait to
00367         ** obtain the optimization-oriented strBuf pointer until after the initial
00368         ** potentially realloc-triggering operations on str.
00369         ** Because there can be other such ops much later in this function, strBuf
00370         ** is guarded within a block to prevent its use during or after the calls
00371         ** that would potentially invalidate it. */
00372         { /* Begin block-guard of strBuf */
00373             TCHAR* strBuf = str.getBuffer();
00374 
00375             bool rightmostIsDot = RIGHTMOST_IS(str, '.');
00376             if (CONSUMED_NOTHING_OF_VALUE) {
00377                 /* No more alphanums available for this token; shave trailing dot, if any. */
00378                 if (rightmostIsDot) {
00379                     SHAVE_RIGHTMOST(str);
00380                 }
00381                 /* If there are no dots remaining, this is a generic ALPHANUM. */
00382                 if (_tcschr(strBuf, '.') == NULL) {
00383                     forcedType = ALPHANUM;
00384                 }
00385 
00386                 /* Check the token to see if it's an acronym.  An acronym must have a
00387                 ** letter in every even slot and a dot in every odd slot, including the
00388                 ** last slot (for example, "U.S.A."). */
00389             } else if (rightmostIsDot) {
00390                 bool isAcronym = true;
00391                 const int32_t upperCheckLimit = str.len - 1; /* -1 b/c we already checked the last slot. */
00392 
00393                 for (int32_t i = 0; i < upperCheckLimit; i++) {
00394                     const bool even = (i % 2 == 0);
                          /* ALPHA and DOT test `ch`, so load the slot into it. */
00395                     ch = strBuf[i];
00396                     if ( (even && !ALPHA) || (!even && !DOT) ) {
00397                         isAcronym = false;
00398                         break;
00399                     }
00400                 }
00401                 if (isAcronym) {
00402                     forcedType = ACRONYM;
00403                 } else {
00404                     /* If it's not an acronym, we don't want the trailing dot. */
00405                     SHAVE_RIGHTMOST(str);
00406                     /* If there are no dots remaining, this is a generic ALPHANUM. */
00407                     if (_tcschr(strBuf, '.') == NULL) {
00408                         forcedType = ALPHANUM;
00409                     }
00410                 }
00411             }
00412         } /* End block-guard of strBuf */
00413 
00414         if (!EOS) {
00415             if (ch == '@' && str.len < LUCENE_MAX_WORD_LEN-1) {
00416                 str.appendChar('@');
00417                 return ReadAt(&str,t);
00418             } else {
00419                 unReadChar();
00420             }
00421         }
00422 
              /* Fall back to HOST only when no type was forced or detected.  The
              ** previous code tested the enum constant UNKNOWN itself, which made
              ** the choice a compile-time constant regardless of forcedType. */
00423         return setToken(t,&str,forcedType != UNKNOWN
00424                         ? forcedType : HOST);
00425     }
00426 
00427     bool CLuceneTokenizer::ReadApostrophe(StringBuffer* _str, Token* t) {
              /* Continues a token after an apostrophe that the caller has already
              ** appended to _str.  Alphabetic characters following it are appended
              ** and the token is typed APOSTROPHE; if none follow, the apostrophe is
              ** removed again and the token is a plain ALPHANUM.  Returns the result
              ** of setToken(). */
00428         StringBuffer& str=*_str;
00429 
00430         TokenTypes tokenType = APOSTROPHE;
              /* Position of the apostrophe, used by CONSUMED_NOTHING_OF_VALUE. */
00431         const int32_t specialCharPos = rdPos;
00432         int ch=0;
00433 
00434         CONSUME_ALPHAS;
00435         if (RIGHTMOST_IS(str, '\'') || CONSUMED_NOTHING_OF_VALUE) {
00436             /* After the apostrophe, no more alphanums were available within this
00437             ** token; shave trailing apostrophe and revert to generic ALPHANUM. */
00438             SHAVE_RIGHTMOST(str);
00439             tokenType = ALPHANUM;
00440         }
              /* Give back the character that stopped CONSUME_ALPHAS. */
00441         if (!EOS) {
00442             unReadChar();
00443         }
00444 
00445         return setToken(t,&str,tokenType);
00446     }
00447 
00448     bool CLuceneTokenizer::ReadAt(StringBuffer* str, Token* t) {
              /* Continues a token after an '@' that the caller has already appended
              ** to str.  The remainder is read by ReadDotted() with EMAIL as the
              ** requested type (its return value is ignored).  If the complete buffer
              ** then contains neither a dot nor a digit, the token is re-typed as
              ** COMPANY.  Always returns true. */
00449         ReadDotted(str, EMAIL,t);
00450         /* JLucene grammar indicates dots/digits not allowed in company name: */
00451         if (!CONTAINS_ANY((*str), ".0123456789")) {
00452             setToken(t,str,COMPANY);
00453         }
00454         return true;
00455     }
00456 
00457     bool CLuceneTokenizer::ReadCompany(StringBuffer* _str, Token* t) {
              /* Continues a token after an '&' that the caller has already appended
              ** to _str.  Alphanumeric characters following it are appended and the
              ** token is typed COMPANY; if nothing useful follows, the ampersand is
              ** removed again and the token is a plain ALPHANUM.  Returns the result
              ** of setToken(). */
00458         StringBuffer& str = *_str;
              /* Position of the ampersand, used by CONSUMED_NOTHING_OF_VALUE. */
00459         const int32_t specialCharPos = rdPos;
00460         int ch=0;
00461 
00462         CONSUME_WORD;
00463         if (CONSUMED_NOTHING_OF_VALUE) {
00464             /* After the ampersand, no more alphanums were available within this
00465             ** token; shave trailing ampersand and revert to ALPHANUM. */
00466             CND_PRECONDITION(RIGHTMOST_IS(str, '&'),"ReadCompany failed");
00467             SHAVE_RIGHTMOST(str);
00468 
00469 
                  /* NOTE(review): this path returns without unreading the character
                  ** that stopped CONSUME_WORD, unlike the path below -- confirm. */
00470             return setToken(t,&str,ALPHANUM);
00471         }
              /* Give back the character that stopped CONSUME_WORD. */
00472         if (!EOS) {
00473             unReadChar();
00474         }
00475 
00476         return setToken(t,&str,COMPANY);
00477     }
00478 }

NepomukDaemons

Skip menu "NepomukDaemons"
  • Main Page
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

API Reference

Skip menu "API Reference"
  • KCMShell
  • KNotify
  • KStyles
  • Nepomuk Daemons
Generated for API Reference by doxygen 1.5.4
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal