• Skip to content
  • Skip to link menu
KDE 4.1 API Reference
  • KDE API Reference
  • API Reference
  • Sitemap
  • Contact Us
 

NepomukDaemons

sopranoindexreader.cpp

Go to the documentation of this file.
00001 /*
00002    Copyright (C) 2007 Sebastian Trueg <trueg@kde.org>
00003 
00004    This library is free software; you can redistribute it and/or
00005    modify it under the terms of the GNU General Public License as
00006    published by the Free Software Foundation; either version 2 of
00007    the License, or (at your option) any later version.
00008 
00009    This library is distributed in the hope that it will be useful,
00010    but WITHOUT ANY WARRANTY; without even the implied warranty of
00011    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00012    Library General Public License for more details.
00013 
00014    You should have received a copy of the GNU General Public License
00015    along with this library; see the file COPYING.  If not, write to
00016    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00017    Boston, MA 02110-1301, USA.
00018 */
00019 
00020 #include "sopranoindexreader.h"
00021 #include "tstring.h"
00022 #include <strigi/query.h>
00023 #include <strigi/queryparser.h>
00024 #include <strigi/fieldtypes.h>
00025 #include "util.h"
00026 
00027 #include <Soprano/Soprano>
00028 #include <Soprano/Index/IndexFilterModel>
00029 #include <Soprano/Index/CLuceneIndex>
00030 #include <Soprano/Vocabulary/XMLSchema>
00031 
00032 #include <map>
00033 #include <utility>
00034 #include <sstream>
00035 
00036 #include <CLucene.h>
00037 
00038 #include <QtCore/QThread>
00039 #include <QtCore/QDateTime>
00040 #include <QtCore/QDebug>
00041 #include <QtCore/QString>
00042 #include <QtCore/QLatin1String>
00043 
00044 
00045 using namespace Soprano;
00046 
00047 
00048 static lucene::index::Term* createWildCardTerm( const TString& name,
00049                                                 const string& value );
00050 static lucene::index::Term* createTerm( const TString& name,
00051                                         const string& value );
00052 static lucene::index::Term* createKeywordTerm( const TString& name,
00053                                                const string& value );
00054 static lucene::search::BooleanQuery* createBooleanQuery( const Strigi::Query& query );
00055 static lucene::search::Query* createQuery( const Strigi::Query& query );
00056 static lucene::search::Query* createSimpleQuery( const Strigi::Query& query );
00057 static lucene::search::Query* createSingleFieldQuery( const string& field,
00058                                                       const Strigi::Query& query );
00059 static lucene::search::Query* createMultiFieldQuery( const Strigi::Query& query );
00060 
00061 
00062 static lucene::index::Term* createWildCardTerm( const TString& name,
00063                                                 const string& value )
00064 {
00065     TString v = TString::fromUtf8( value.c_str() );
00066     return _CLNEW lucene::index::Term( name.data(), v.data() );
00067 }
00068 
00069 static lucene::index::Term* createTerm( const TString& name,
00070                                         const string& value )
00071 {
00072     qDebug() << "createTerm" << name << value.c_str();
00073 
00074     TString v = TString::fromUtf8( value.c_str() );
00075 
00076     lucene::util::StringReader sr( v.data() );
00077     lucene::analysis::standard::StandardAnalyzer a;
00078     lucene::analysis::TokenStream* ts = a.tokenStream(name.data(), &sr);
00079     lucene::analysis::Token* to = ts->next();
00080     const wchar_t *tv;
00081     if (to) {
00082         tv = to->termText();
00083     } else {
00084         tv = v.data();
00085     }
00086     lucene::index::Term* t = _CLNEW lucene::index::Term(name.data(), tv);
00087     if (to) {
00088         _CLDELETE(to);
00089     }
00090     _CLDELETE(ts);
00091     return t;
00092 }
00093 
00094 static lucene::index::Term* createKeywordTerm( const TString& name,
00095                                                const string& value )
00096 {
00097     TString v = TString::fromUtf8( value.c_str() );
00098     lucene::index::Term* t = _CLNEW lucene::index::Term( name.data(), v.data() );
00099     return t;
00100 }
00101 
00102 static lucene::search::BooleanQuery* createBooleanQuery( const Strigi::Query& query )
00103 {
00104     lucene::search::BooleanQuery* bq = _CLNEW lucene::search::BooleanQuery();
00105     bool isAnd = query.type() == Strigi::Query::And;
00106     const vector<Strigi::Query>& sub = query.subQueries();
00107     for (vector<Strigi::Query>::const_iterator i = sub.begin(); i != sub.end(); ++i) {
00108         lucene::search::Query* q = createQuery(*i);
00109         bq->add(q, true, isAnd, i->negate());
00110     }
00111     return bq;
00112 }
00113 
00114 static lucene::search::Query* createQuery( const Strigi::Query& query )
00115 {
00116     return query.subQueries().size()
00117         ? createBooleanQuery(query)
00118         : createSimpleQuery(query);
00119 }
00120 
00121 static lucene::search::Query* createSimpleQuery( const Strigi::Query& query )
00122 {
00123     switch (query.fields().size()) {
00124     case 0:  return createSingleFieldQuery("text", query);
00125     case 1:  return createSingleFieldQuery(query.fields()[0], query);
00126     default: return createMultiFieldQuery(query);
00127     }
00128 }
00129 
00130 static lucene::search::Query* createSingleFieldQuery( const string& field,
00131                                                       const Strigi::Query& query ) {
00132     qDebug() << "Creating single field query: " << field.c_str();
00133     TString fieldname = Strigi::Soprano::Util::convertSearchField( field );
00134     lucene::search::Query* q;
00135     lucene::index::Term* t;
00136     const string& val = query.term().string();
00137     switch (query.type()) {
00138     case Strigi::Query::LessThan:
00139           t = createTerm(fieldname, val.c_str());
00140           q = _CLNEW lucene::search::RangeQuery(0, t, false);
00141           break;
00142     case Strigi::Query::LessThanEquals:
00143           t = createTerm(fieldname, query.term().string());
00144           q = _CLNEW lucene::search::RangeQuery(0, t, true);
00145           break;
00146     case Strigi::Query::GreaterThan:
00147           t = createTerm(fieldname, query.term().string());
00148           q = _CLNEW lucene::search::RangeQuery(t, 0, false);
00149           break;
00150     case Strigi::Query::GreaterThanEquals:
00151           t = createTerm(fieldname, query.term().string());
00152           q = _CLNEW lucene::search::RangeQuery(t, 0, true);
00153           break;
00154     case Strigi::Query::Keyword:
00155           t = createKeywordTerm(fieldname, query.term().string());
00156           q = _CLNEW lucene::search::TermQuery(t);
00157           break;
00158     default:
00159           if (strpbrk(val.c_str(), "*?")) {
00160                t = createWildCardTerm(fieldname, val);
00161                q = _CLNEW lucene::search::WildcardQuery(t);
00162           } else {
00163                t = createTerm(fieldname, val);
00164                q = _CLNEW lucene::search::TermQuery(t);
00165           }
00166     }
00167     _CLDECDELETE(t);
00168     return q;
00169 }
00170 
00171 static lucene::search::Query* createMultiFieldQuery( const Strigi::Query& query )
00172 {
00173     lucene::search::BooleanQuery* bq = _CLNEW lucene::search::BooleanQuery();
00174     for (vector<string>::const_iterator i = query.fields().begin();
00175             i != query.fields().end(); ++i) {
00176         lucene::search::Query* q = createSingleFieldQuery(*i, query);
00177         bq->add(q, true, false, false);
00178     }
00179     return bq;
00180 }
00181 
00182 
00183 static QString escapeLiteralForSparqlQuery( const QString& s )
00184 {
00185     return QString( s ).replace( '\\', "\\\\" ).replace( '\"', "\\\"" );
00186 }
00187 
00188 
00189 class Strigi::Soprano::IndexReader::Private
00190 {
00191 public:
00192     bool createDocument( const Node& res, IndexedDocument& doc ) {
00193         StatementIterator it = repository->listStatements( Statement( res, Node(), Node() ) );
00194         if ( it.lastError() ) {
00195             return false;
00196         }
00197 
00198         // use the resource URI as fallback file URI
00199         doc.uri = res.uri().toLocalFile().toUtf8().data();
00200 
00201         while ( it.next() ) {
00202             Statement s = *it;
00203             if ( s.object().isLiteral() ) {
00204                 std::string fieldName = Util::fieldName( s.predicate().uri() );
00205                 std::string value = s.object().toString().toUtf8().data();
00206 
00207                 if (fieldName == "text") {
00208                     doc.fragment = value;
00209                 }
00210                 else if (fieldName == FieldRegister::pathFieldName) {
00211                     qDebug() << "Setting IndexedDocument uri=" << value.c_str();
00212                     doc.uri = value;
00213                 }
00214                 else if (fieldName == FieldRegister::mimetypeFieldName) {
00215                     doc.mimetype = value;
00216                 }
00217                 else if (fieldName == FieldRegister::mtimeFieldName) {
00218                     // FIXME: Sadly in Xesam sourceModified is not typed as DateTime but defaults to an int :( We try to be compatible
00219                     if ( s.object().literal().isDateTime() ) {
00220                         doc.mtime = s.object().literal().toDateTime().toTime_t();
00221                     }
00222                     else {
00223                         doc.mtime = s.object().literal().toUnsignedInt();
00224                     }
00225                 }
00226                 else if (fieldName == FieldRegister::sizeFieldName) {
00227                     doc.size = s.object().literal().toInt64();
00228                 }
00229                 else {
00230                     doc.properties.insert( make_pair<const string, string>( fieldName, value ) );
00231                 }
00232             }
00233             else {
00234                 // FIXME: For "Strigi++" we should at least go one level deeper, i.e. make an RDF query on those results that are
00235                 // not literal statements
00236             }
00237         }
00238 
00239         return true;
00240     }
00241 
00242 //    ::Soprano::Index::IndexFilterModel* repository;
00243     ::Soprano::Model* repository;
00244 };
00245 
00246 
00247 Strigi::Soprano::IndexReader::IndexReader( ::Soprano::Model* model )
00248     : Strigi::IndexReader()
00249 {
00250     qDebug() << "IndexReader::IndexReader in thread" << QThread::currentThread();
00251     d = new Private;
00252     d->repository = model;
00253 }
00254 
00255 
00256 Strigi::Soprano::IndexReader::~IndexReader()
00257 {
00258     qDebug() << "IndexReader::~IndexReader in thread" << QThread::currentThread();
00259     delete d;
00260 }
00261 
00262 
00263 int32_t Strigi::Soprano::IndexReader::countHits( const Query& query )
00264 {
00265     qDebug() << "IndexReader::countHits in thread" << QThread::currentThread();
00266 
00267     lucene::search::Query* q = createQuery( query );
00268     ::Soprano::QueryResultIterator hits = d->repository->executeQuery( TString( q->toString(), true ),
00269                                                                        ::Soprano::Query::QUERY_LANGUAGE_USER,
00270                                                                        QLatin1String( "lucene" ) );
00271 //    Iterator< ::Soprano::Index::QueryHit> hits = d->repository->index()->search( q );
00272     int s = 0;
00273     while ( hits.next() ) {
00274         qDebug() << "Query hit:" << hits.binding( 0 );
00275         ++s;
00276     }
00277     _CLDELETE(q);
00278     return s;
00279 }
00280 
00281 
00282 void Strigi::Soprano::IndexReader::getHits( const Strigi::Query& query,
00283                                             const std::vector<std::string>& fields,
00284                                             const std::vector<Strigi::Variant::Type>& types,
00285                                             std::vector<std::vector<Strigi::Variant> >& result,
00286                                             int off, int max )
00287 {
00288     qDebug() << "IndexReader::getHits in thread" << QThread::currentThread();
00289     lucene::search::Query* bq = createQuery( query );
00290     ::Soprano::QueryResultIterator hits = d->repository->executeQuery( TString( bq->toString(), true ),
00291                                                                        ::Soprano::Query::QUERY_LANGUAGE_USER,
00292                                                                        QLatin1String( "lucene" ) );
00293 //    Iterator< ::Soprano::Index::QueryHit> hits = d->repository->index()->search( bq );
00294 
00295     int i = -1;
00296     while ( hits.next() ) {
00297         ++i;
00298         if ( i < off ) {
00299             continue;
00300         }
00301         if ( i > max ) {
00302             break;
00303         }
00304 
00305 //        ::Soprano::Index::QueryHit hit = *hits;
00306         std::vector<Strigi::Variant> resultRow;
00307         std::vector<std::string>::const_iterator fieldIt = fields.begin();
00308         std::vector<Strigi::Variant::Type>::const_iterator typesIt = types.begin();
00309         while ( fieldIt != fields.end() ) {
00310             if ( typesIt == types.end() ) {
00311                 qFatal( "(Soprano::IndexReader) Invalid types list in getHits!" );
00312                 return;
00313             }
00314 
00315             StatementIterator it = d->repository->listStatements( Statement( hits.binding( "resource" ),
00316                                                                              Util::fieldUri( *fieldIt ),
00317                                                                              Node() ) );
00318             // FIXME: what if we have a field with a cardinality > 1?
00319             if ( it.next() ) {
00320                 resultRow.push_back( Util::nodeToVariant( it.current().object() ) );
00321             }
00322             else {
00323                 resultRow.push_back( Strigi::Variant() );
00324             }
00325 
00326             ++fieldIt;
00327             ++typesIt;
00328         }
00329 
00330         result.push_back( resultRow );
00331     }
00332     _CLDELETE(bq);
00333 }
00334 
00335 
00336 std::vector<Strigi::IndexedDocument> Strigi::Soprano::IndexReader::query( const Query& query, int off, int max )
00337 {
00338     qDebug() << "IndexReader::query in thread" << QThread::currentThread();
00339     vector<IndexedDocument> results;
00340     lucene::search::Query* bq = createQuery( query );
00341     ::Soprano::QueryResultIterator hits = d->repository->executeQuery( TString( bq->toString(), true ),
00342                                                                        ::Soprano::Query::QUERY_LANGUAGE_USER,
00343                                                                        QLatin1String( "lucene" ) );
00344 //    Iterator< ::Soprano::Index::QueryHit> hits = d->repository->index()->search( bq );
00345 
00346     int i = -1;
00347     while ( hits.next() ) {
00348         ++i;
00349         if ( i < off ) {
00350             continue;
00351         }
00352         if ( i > max ) {
00353             break;
00354         }
00355 
00356         IndexedDocument result;
00357 //        ::Soprano::Index::QueryHit hit = *hits;
00358         result.score = hits.binding( 1 ).literal().toDouble();
00359         if ( d->createDocument( hits.binding( 0 ), result ) ) {
00360             results.push_back( result );
00361         }
00362         else {
00363             qDebug() << "Failed to create indexed document for resource " << hits.binding( 0 ) << ": " << d->repository->lastError();
00364         }
00365     }
00366     _CLDELETE(bq);
00367     return results;
00368 }
00369 
00370 
00371 // an empty parent url is perfectly valid as strigi stores a parent url for everything
00372 void Strigi::Soprano::IndexReader::getChildren( const std::string& parent,
00373                                                 std::map<std::string, time_t>& children )
00374 {
00375 //    qDebug() << "IndexReader::getChildren in thread" << QThread::currentThread();
00376     QString query = QString( "select distinct ?path ?mtime where { ?r <%1> \"%2\"^^<%3> . ?r <%4> ?mtime . ?r <%5> ?path . }")
00377                     .arg( Util::fieldUri( FieldRegister::parentLocationFieldName ).toString() )
00378                     .arg( escapeLiteralForSparqlQuery( QString::fromUtf8( parent.c_str() ) ) )
00379                     .arg( Vocabulary::XMLSchema::string().toString() )
00380                     .arg( Util::fieldUri( FieldRegister::mtimeFieldName ).toString() )
00381                     .arg( Util::fieldUri( FieldRegister::pathFieldName ).toString() );
00382 
00383     qDebug() << "running getChildren query:" << query;
00384 
00385     QueryResultIterator result = d->repository->executeQuery( query, ::Soprano::Query::QUERY_LANGUAGE_SPARQL );
00386 
00387     while ( result.next() ) {
00388         Node pathNode = result.binding( "path" );
00389         Node mTimeNode = result.binding( "mtime" );
00390 //        qDebug() << "file in index: " << pathNode.toString() << "mtime:" << mTimeNode.literal().toDateTime() << "(" << mTimeNode.literal().toDateTime().toTime_t() << ")";
00391 
00392         // FIXME: Sadly in Xesam sourceModified is not typed as DateTime but defaults to an int :( We try to be compatible
00393         if ( mTimeNode.literal().isDateTime() ) {
00394             children[std::string( pathNode.toString().toUtf8().data() )] = mTimeNode.literal().toDateTime().toTime_t();
00395         }
00396         else {
00397             children[std::string( pathNode.toString().toUtf8().data() )] = mTimeNode.literal().toUnsignedInt();
00398         }
00399     }
00400 }
00401 
00402 
00403 int32_t Strigi::Soprano::IndexReader::countDocuments()
00404 {
00405     qDebug() << "IndexReader::countDocuments in thread" << QThread::currentThread();
00406     // FIXME: the only solution I see ATM is: select distinct ?r where { ?r ?p ?o }
00407     return 0;
00408 }
00409 
00410 
00411 int32_t Strigi::Soprano::IndexReader::countWords()
00412 {
00413     qDebug() << "IndexReader::countWords in thread" << QThread::currentThread();
00414     // FIXME: what to do here? use the index? Count the predicates?
00415     return -1;
00416 }
00417 
00418 
00419 int64_t Strigi::Soprano::IndexReader::indexSize()
00420 {
00421     qDebug() << "IndexReader::indexSize in thread" << QThread::currentThread();
00422     return d->repository->statementCount();
00423 }
00424 
00425 
00426 time_t Strigi::Soprano::IndexReader::mTime( const std::string& uri )
00427 {
00428 //    qDebug() << "IndexReader::mTime in thread" << QThread::currentThread();
00429     QString query = QString( "select ?mtime where { ?r <%2> \"%3\"^^<%4> . ?r <%1> ?mtime . }" )
00430                     .arg( Util::fieldUri( FieldRegister::mtimeFieldName ).toString() )
00431                     .arg( Util::fieldUri( FieldRegister::pathFieldName ).toString() )
00432                     .arg( escapeLiteralForSparqlQuery( QString::fromUtf8( uri.c_str() ) ) )
00433                     .arg( Vocabulary::XMLSchema::string().toString() );
00434 
00435     qDebug() << "mTime( " << uri.c_str() << ") query:" << query;
00436 
00437     QueryResultIterator it = d->repository->executeQuery( query, ::Soprano::Query::QUERY_LANGUAGE_SPARQL );
00438 
00439     time_t mtime = 0;
00440     if ( it.next() ) {
00441         ::Soprano::LiteralValue val = it.binding( "mtime" ).literal();
00442 
00443         // FIXME: Sadly in Xesam sourceModified is not typed as DateTime but defaults to an int :( We try to be compatible
00444         if ( val.isDateTime() ) {
00445             mtime = val.toDateTime().toTime_t();
00446         }
00447         else {
00448             mtime = val.toUnsignedInt();
00449         }
00450     }
00451     return mtime;
00452 }
00453 
00454 
00455 std::vector<std::string> Strigi::Soprano::IndexReader::fieldNames()
00456 {
00457     qDebug() << "IndexReader::fieldNames in thread" << QThread::currentThread();
00458     // This is a weird method
00459     // Our list of field names (the predicates) is probably awefully long.
00460 
00461     std::vector<std::string> fields;
00462     QueryResultIterator it = d->repository->executeQuery( "select distinct ?p where { ?r ?p ?o . }", ::Soprano::Query::QUERY_LANGUAGE_SPARQL );
00463     while ( it.next() ) {
00464         fields.push_back( Util::fieldName( it.binding("p").uri() ) );
00465     }
00466     return fields;
00467 }
00468 
00469 
00470 std::vector<std::pair<std::string,uint32_t> > Strigi::Soprano::IndexReader::histogram( const std::string& query,
00471                                                                                        const std::string& fieldname,
00472                                                                                        const std::string& labeltype )
00473 {
00474     Q_UNUSED(query);
00475     Q_UNUSED(fieldname);
00476     Q_UNUSED(labeltype);
00477 
00478     // FIXME: what is meant by fieldname and labeltype?
00479     qDebug() << "IndexReader::histogram in thread" << QThread::currentThread();
00480     // IMPLEMENTME? Seems not like a very important method though.
00481     return std::vector<std::pair<std::string,uint32_t> >();
00482 }
00483 
00484 
00485 int32_t Strigi::Soprano::IndexReader::countKeywords( const std::string& keywordprefix,
00486                                                      const std::vector<std::string>& fieldnames)
00487 {
00488     Q_UNUSED(keywordprefix);
00489     Q_UNUSED(fieldnames);
00490 
00491     qDebug() << "IndexReader::countKeywords in thread" << QThread::currentThread();
00492     // the clucene indexer also returns 2. I suspect this means: "not implemented" ;)
00493     return 2;
00494 }
00495 
00496 
00497 std::vector<std::string> Strigi::Soprano::IndexReader::keywords( const std::string& keywordmatch,
00498                                                                  const std::vector<std::string>& fieldnames,
00499                                                                  uint32_t max, uint32_t offset )
00500 {
00501     Q_UNUSED(keywordmatch);
00502     Q_UNUSED(fieldnames);
00503     Q_UNUSED(max);
00504     Q_UNUSED(offset);
00505 
00506     qDebug() << "IndexReader::keywords in thread" << QThread::currentThread();
00507     // IMPLEMENTME? Seems like a rarely used method...
00508     return std::vector<std::string>();
00509 }

NepomukDaemons

Skip menu "NepomukDaemons"
  • Main Page
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

API Reference

Skip menu "API Reference"
  • KCMShell
  • KNotify
  • KStyles
  • Nepomuk Daemons
Generated for API Reference by doxygen 1.5.4
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal