00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include "sopranoindexreader.h"
00021 #include "tstring.h"
00022 #include <strigi/query.h>
00023 #include <strigi/queryparser.h>
00024 #include <strigi/fieldtypes.h>
00025 #include "util.h"
00026
00027 #include <Soprano/Soprano>
00028 #include <Soprano/Index/IndexFilterModel>
00029 #include <Soprano/Index/CLuceneIndex>
00030 #include <Soprano/Vocabulary/XMLSchema>
00031
00032 #include <map>
00033 #include <utility>
00034 #include <sstream>
00035
00036 #include <CLucene.h>
00037
00038 #include <QtCore/QThread>
00039 #include <QtCore/QDateTime>
00040 #include <QtCore/QDebug>
00041 #include <QtCore/QString>
00042 #include <QtCore/QLatin1String>
00043
00044
00045 using namespace Soprano;
00046
00047
00048 static lucene::index::Term* createWildCardTerm( const TString& name,
00049 const string& value );
00050 static lucene::index::Term* createTerm( const TString& name,
00051 const string& value );
00052 static lucene::index::Term* createKeywordTerm( const TString& name,
00053 const string& value );
00054 static lucene::search::BooleanQuery* createBooleanQuery( const Strigi::Query& query );
00055 static lucene::search::Query* createQuery( const Strigi::Query& query );
00056 static lucene::search::Query* createSimpleQuery( const Strigi::Query& query );
00057 static lucene::search::Query* createSingleFieldQuery( const string& field,
00058 const Strigi::Query& query );
00059 static lucene::search::Query* createMultiFieldQuery( const Strigi::Query& query );
00060
00061
00062 static lucene::index::Term* createWildCardTerm( const TString& name,
00063 const string& value )
00064 {
00065 TString v = TString::fromUtf8( value.c_str() );
00066 return _CLNEW lucene::index::Term( name.data(), v.data() );
00067 }
00068
00069 static lucene::index::Term* createTerm( const TString& name,
00070 const string& value )
00071 {
00072 qDebug() << "createTerm" << name << value.c_str();
00073
00074 TString v = TString::fromUtf8( value.c_str() );
00075
00076 lucene::util::StringReader sr( v.data() );
00077 lucene::analysis::standard::StandardAnalyzer a;
00078 lucene::analysis::TokenStream* ts = a.tokenStream(name.data(), &sr);
00079 lucene::analysis::Token* to = ts->next();
00080 const wchar_t *tv;
00081 if (to) {
00082 tv = to->termText();
00083 } else {
00084 tv = v.data();
00085 }
00086 lucene::index::Term* t = _CLNEW lucene::index::Term(name.data(), tv);
00087 if (to) {
00088 _CLDELETE(to);
00089 }
00090 _CLDELETE(ts);
00091 return t;
00092 }
00093
00094 static lucene::index::Term* createKeywordTerm( const TString& name,
00095 const string& value )
00096 {
00097 TString v = TString::fromUtf8( value.c_str() );
00098 lucene::index::Term* t = _CLNEW lucene::index::Term( name.data(), v.data() );
00099 return t;
00100 }
00101
00102 static lucene::search::BooleanQuery* createBooleanQuery( const Strigi::Query& query )
00103 {
00104 lucene::search::BooleanQuery* bq = _CLNEW lucene::search::BooleanQuery();
00105 bool isAnd = query.type() == Strigi::Query::And;
00106 const vector<Strigi::Query>& sub = query.subQueries();
00107 for (vector<Strigi::Query>::const_iterator i = sub.begin(); i != sub.end(); ++i) {
00108 lucene::search::Query* q = createQuery(*i);
00109 bq->add(q, true, isAnd, i->negate());
00110 }
00111 return bq;
00112 }
00113
00114 static lucene::search::Query* createQuery( const Strigi::Query& query )
00115 {
00116 return query.subQueries().size()
00117 ? createBooleanQuery(query)
00118 : createSimpleQuery(query);
00119 }
00120
00121 static lucene::search::Query* createSimpleQuery( const Strigi::Query& query )
00122 {
00123 switch (query.fields().size()) {
00124 case 0: return createSingleFieldQuery("text", query);
00125 case 1: return createSingleFieldQuery(query.fields()[0], query);
00126 default: return createMultiFieldQuery(query);
00127 }
00128 }
00129
00130 static lucene::search::Query* createSingleFieldQuery( const string& field,
00131 const Strigi::Query& query ) {
00132 qDebug() << "Creating single field query: " << field.c_str();
00133 TString fieldname = Strigi::Soprano::Util::convertSearchField( field );
00134 lucene::search::Query* q;
00135 lucene::index::Term* t;
00136 const string& val = query.term().string();
00137 switch (query.type()) {
00138 case Strigi::Query::LessThan:
00139 t = createTerm(fieldname, val.c_str());
00140 q = _CLNEW lucene::search::RangeQuery(0, t, false);
00141 break;
00142 case Strigi::Query::LessThanEquals:
00143 t = createTerm(fieldname, query.term().string());
00144 q = _CLNEW lucene::search::RangeQuery(0, t, true);
00145 break;
00146 case Strigi::Query::GreaterThan:
00147 t = createTerm(fieldname, query.term().string());
00148 q = _CLNEW lucene::search::RangeQuery(t, 0, false);
00149 break;
00150 case Strigi::Query::GreaterThanEquals:
00151 t = createTerm(fieldname, query.term().string());
00152 q = _CLNEW lucene::search::RangeQuery(t, 0, true);
00153 break;
00154 case Strigi::Query::Keyword:
00155 t = createKeywordTerm(fieldname, query.term().string());
00156 q = _CLNEW lucene::search::TermQuery(t);
00157 break;
00158 default:
00159 if (strpbrk(val.c_str(), "*?")) {
00160 t = createWildCardTerm(fieldname, val);
00161 q = _CLNEW lucene::search::WildcardQuery(t);
00162 } else {
00163 t = createTerm(fieldname, val);
00164 q = _CLNEW lucene::search::TermQuery(t);
00165 }
00166 }
00167 _CLDECDELETE(t);
00168 return q;
00169 }
00170
00171 static lucene::search::Query* createMultiFieldQuery( const Strigi::Query& query )
00172 {
00173 lucene::search::BooleanQuery* bq = _CLNEW lucene::search::BooleanQuery();
00174 for (vector<string>::const_iterator i = query.fields().begin();
00175 i != query.fields().end(); ++i) {
00176 lucene::search::Query* q = createSingleFieldQuery(*i, query);
00177 bq->add(q, true, false, false);
00178 }
00179 return bq;
00180 }
00181
00182
00183 static QString escapeLiteralForSparqlQuery( const QString& s )
00184 {
00185 return QString( s ).replace( '\\', "\\\\" ).replace( '\"', "\\\"" );
00186 }
00187
00188
00189 class Strigi::Soprano::IndexReader::Private
00190 {
00191 public:
00192 bool createDocument( const Node& res, IndexedDocument& doc ) {
00193 StatementIterator it = repository->listStatements( Statement( res, Node(), Node() ) );
00194 if ( it.lastError() ) {
00195 return false;
00196 }
00197
00198
00199 doc.uri = res.uri().toLocalFile().toUtf8().data();
00200
00201 while ( it.next() ) {
00202 Statement s = *it;
00203 if ( s.object().isLiteral() ) {
00204 std::string fieldName = Util::fieldName( s.predicate().uri() );
00205 std::string value = s.object().toString().toUtf8().data();
00206
00207 if (fieldName == "text") {
00208 doc.fragment = value;
00209 }
00210 else if (fieldName == FieldRegister::pathFieldName) {
00211 qDebug() << "Setting IndexedDocument uri=" << value.c_str();
00212 doc.uri = value;
00213 }
00214 else if (fieldName == FieldRegister::mimetypeFieldName) {
00215 doc.mimetype = value;
00216 }
00217 else if (fieldName == FieldRegister::mtimeFieldName) {
00218
00219 if ( s.object().literal().isDateTime() ) {
00220 doc.mtime = s.object().literal().toDateTime().toTime_t();
00221 }
00222 else {
00223 doc.mtime = s.object().literal().toUnsignedInt();
00224 }
00225 }
00226 else if (fieldName == FieldRegister::sizeFieldName) {
00227 doc.size = s.object().literal().toInt64();
00228 }
00229 else {
00230 doc.properties.insert( make_pair<const string, string>( fieldName, value ) );
00231 }
00232 }
00233 else {
00234
00235
00236 }
00237 }
00238
00239 return true;
00240 }
00241
00242
00243 ::Soprano::Model* repository;
00244 };
00245
00246
00247 Strigi::Soprano::IndexReader::IndexReader( ::Soprano::Model* model )
00248 : Strigi::IndexReader()
00249 {
00250 qDebug() << "IndexReader::IndexReader in thread" << QThread::currentThread();
00251 d = new Private;
00252 d->repository = model;
00253 }
00254
00255
00256 Strigi::Soprano::IndexReader::~IndexReader()
00257 {
00258 qDebug() << "IndexReader::~IndexReader in thread" << QThread::currentThread();
00259 delete d;
00260 }
00261
00262
00263 int32_t Strigi::Soprano::IndexReader::countHits( const Query& query )
00264 {
00265 qDebug() << "IndexReader::countHits in thread" << QThread::currentThread();
00266
00267 lucene::search::Query* q = createQuery( query );
00268 ::Soprano::QueryResultIterator hits = d->repository->executeQuery( TString( q->toString(), true ),
00269 ::Soprano::Query::QUERY_LANGUAGE_USER,
00270 QLatin1String( "lucene" ) );
00271
00272 int s = 0;
00273 while ( hits.next() ) {
00274 qDebug() << "Query hit:" << hits.binding( 0 );
00275 ++s;
00276 }
00277 _CLDELETE(q);
00278 return s;
00279 }
00280
00281
00282 void Strigi::Soprano::IndexReader::getHits( const Strigi::Query& query,
00283 const std::vector<std::string>& fields,
00284 const std::vector<Strigi::Variant::Type>& types,
00285 std::vector<std::vector<Strigi::Variant> >& result,
00286 int off, int max )
00287 {
00288 qDebug() << "IndexReader::getHits in thread" << QThread::currentThread();
00289 lucene::search::Query* bq = createQuery( query );
00290 ::Soprano::QueryResultIterator hits = d->repository->executeQuery( TString( bq->toString(), true ),
00291 ::Soprano::Query::QUERY_LANGUAGE_USER,
00292 QLatin1String( "lucene" ) );
00293
00294
00295 int i = -1;
00296 while ( hits.next() ) {
00297 ++i;
00298 if ( i < off ) {
00299 continue;
00300 }
00301 if ( i > max ) {
00302 break;
00303 }
00304
00305
00306 std::vector<Strigi::Variant> resultRow;
00307 std::vector<std::string>::const_iterator fieldIt = fields.begin();
00308 std::vector<Strigi::Variant::Type>::const_iterator typesIt = types.begin();
00309 while ( fieldIt != fields.end() ) {
00310 if ( typesIt == types.end() ) {
00311 qFatal( "(Soprano::IndexReader) Invalid types list in getHits!" );
00312 return;
00313 }
00314
00315 StatementIterator it = d->repository->listStatements( Statement( hits.binding( "resource" ),
00316 Util::fieldUri( *fieldIt ),
00317 Node() ) );
00318
00319 if ( it.next() ) {
00320 resultRow.push_back( Util::nodeToVariant( it.current().object() ) );
00321 }
00322 else {
00323 resultRow.push_back( Strigi::Variant() );
00324 }
00325
00326 ++fieldIt;
00327 ++typesIt;
00328 }
00329
00330 result.push_back( resultRow );
00331 }
00332 _CLDELETE(bq);
00333 }
00334
00335
00336 std::vector<Strigi::IndexedDocument> Strigi::Soprano::IndexReader::query( const Query& query, int off, int max )
00337 {
00338 qDebug() << "IndexReader::query in thread" << QThread::currentThread();
00339 vector<IndexedDocument> results;
00340 lucene::search::Query* bq = createQuery( query );
00341 ::Soprano::QueryResultIterator hits = d->repository->executeQuery( TString( bq->toString(), true ),
00342 ::Soprano::Query::QUERY_LANGUAGE_USER,
00343 QLatin1String( "lucene" ) );
00344
00345
00346 int i = -1;
00347 while ( hits.next() ) {
00348 ++i;
00349 if ( i < off ) {
00350 continue;
00351 }
00352 if ( i > max ) {
00353 break;
00354 }
00355
00356 IndexedDocument result;
00357
00358 result.score = hits.binding( 1 ).literal().toDouble();
00359 if ( d->createDocument( hits.binding( 0 ), result ) ) {
00360 results.push_back( result );
00361 }
00362 else {
00363 qDebug() << "Failed to create indexed document for resource " << hits.binding( 0 ) << ": " << d->repository->lastError();
00364 }
00365 }
00366 _CLDELETE(bq);
00367 return results;
00368 }
00369
00370
00371
00372 void Strigi::Soprano::IndexReader::getChildren( const std::string& parent,
00373 std::map<std::string, time_t>& children )
00374 {
00375
00376 QString query = QString( "select distinct ?path ?mtime where { ?r <%1> \"%2\"^^<%3> . ?r <%4> ?mtime . ?r <%5> ?path . }")
00377 .arg( Util::fieldUri( FieldRegister::parentLocationFieldName ).toString() )
00378 .arg( escapeLiteralForSparqlQuery( QString::fromUtf8( parent.c_str() ) ) )
00379 .arg( Vocabulary::XMLSchema::string().toString() )
00380 .arg( Util::fieldUri( FieldRegister::mtimeFieldName ).toString() )
00381 .arg( Util::fieldUri( FieldRegister::pathFieldName ).toString() );
00382
00383 qDebug() << "running getChildren query:" << query;
00384
00385 QueryResultIterator result = d->repository->executeQuery( query, ::Soprano::Query::QUERY_LANGUAGE_SPARQL );
00386
00387 while ( result.next() ) {
00388 Node pathNode = result.binding( "path" );
00389 Node mTimeNode = result.binding( "mtime" );
00390
00391
00392
00393 if ( mTimeNode.literal().isDateTime() ) {
00394 children[std::string( pathNode.toString().toUtf8().data() )] = mTimeNode.literal().toDateTime().toTime_t();
00395 }
00396 else {
00397 children[std::string( pathNode.toString().toUtf8().data() )] = mTimeNode.literal().toUnsignedInt();
00398 }
00399 }
00400 }
00401
00402
00403 int32_t Strigi::Soprano::IndexReader::countDocuments()
00404 {
00405 qDebug() << "IndexReader::countDocuments in thread" << QThread::currentThread();
00406
00407 return 0;
00408 }
00409
00410
00411 int32_t Strigi::Soprano::IndexReader::countWords()
00412 {
00413 qDebug() << "IndexReader::countWords in thread" << QThread::currentThread();
00414
00415 return -1;
00416 }
00417
00418
00419 int64_t Strigi::Soprano::IndexReader::indexSize()
00420 {
00421 qDebug() << "IndexReader::indexSize in thread" << QThread::currentThread();
00422 return d->repository->statementCount();
00423 }
00424
00425
00426 time_t Strigi::Soprano::IndexReader::mTime( const std::string& uri )
00427 {
00428
00429 QString query = QString( "select ?mtime where { ?r <%2> \"%3\"^^<%4> . ?r <%1> ?mtime . }" )
00430 .arg( Util::fieldUri( FieldRegister::mtimeFieldName ).toString() )
00431 .arg( Util::fieldUri( FieldRegister::pathFieldName ).toString() )
00432 .arg( escapeLiteralForSparqlQuery( QString::fromUtf8( uri.c_str() ) ) )
00433 .arg( Vocabulary::XMLSchema::string().toString() );
00434
00435 qDebug() << "mTime( " << uri.c_str() << ") query:" << query;
00436
00437 QueryResultIterator it = d->repository->executeQuery( query, ::Soprano::Query::QUERY_LANGUAGE_SPARQL );
00438
00439 time_t mtime = 0;
00440 if ( it.next() ) {
00441 ::Soprano::LiteralValue val = it.binding( "mtime" ).literal();
00442
00443
00444 if ( val.isDateTime() ) {
00445 mtime = val.toDateTime().toTime_t();
00446 }
00447 else {
00448 mtime = val.toUnsignedInt();
00449 }
00450 }
00451 return mtime;
00452 }
00453
00454
00455 std::vector<std::string> Strigi::Soprano::IndexReader::fieldNames()
00456 {
00457 qDebug() << "IndexReader::fieldNames in thread" << QThread::currentThread();
00458
00459
00460
00461 std::vector<std::string> fields;
00462 QueryResultIterator it = d->repository->executeQuery( "select distinct ?p where { ?r ?p ?o . }", ::Soprano::Query::QUERY_LANGUAGE_SPARQL );
00463 while ( it.next() ) {
00464 fields.push_back( Util::fieldName( it.binding("p").uri() ) );
00465 }
00466 return fields;
00467 }
00468
00469
00470 std::vector<std::pair<std::string,uint32_t> > Strigi::Soprano::IndexReader::histogram( const std::string& query,
00471 const std::string& fieldname,
00472 const std::string& labeltype )
00473 {
00474 Q_UNUSED(query);
00475 Q_UNUSED(fieldname);
00476 Q_UNUSED(labeltype);
00477
00478
00479 qDebug() << "IndexReader::histogram in thread" << QThread::currentThread();
00480
00481 return std::vector<std::pair<std::string,uint32_t> >();
00482 }
00483
00484
00485 int32_t Strigi::Soprano::IndexReader::countKeywords( const std::string& keywordprefix,
00486 const std::vector<std::string>& fieldnames)
00487 {
00488 Q_UNUSED(keywordprefix);
00489 Q_UNUSED(fieldnames);
00490
00491 qDebug() << "IndexReader::countKeywords in thread" << QThread::currentThread();
00492
00493 return 2;
00494 }
00495
00496
00497 std::vector<std::string> Strigi::Soprano::IndexReader::keywords( const std::string& keywordmatch,
00498 const std::vector<std::string>& fieldnames,
00499 uint32_t max, uint32_t offset )
00500 {
00501 Q_UNUSED(keywordmatch);
00502 Q_UNUSED(fieldnames);
00503 Q_UNUSED(max);
00504 Q_UNUSED(offset);
00505
00506 qDebug() << "IndexReader::keywords in thread" << QThread::currentThread();
00507
00508 return std::vector<std::string>();
00509 }