• Skip to content
  • Skip to link menu
KDE 4.1 API Reference
  • KDE API Reference
  • API Reference
  • Sitemap
  • Contact Us
 

NepomukDaemons

sopranoindexwriter.cpp

Go to the documentation of this file.
00001 /*
00002    Copyright (C) 2007-2008 Sebastian Trueg <trueg@kde.org>
00003 
00004    This library is free software; you can redistribute it and/or
00005    modify it under the terms of the GNU General Public License as
00006    published by the Free Software Foundation; either version 2 of
00007    the License, or (at your option) any later version.
00008 
00009    This library is distributed in the hope that it will be useful,
00010    but WITHOUT ANY WARRANTY; without even the implied warranty of
00011    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00012    Library General Public License for more details.
00013 
00014    You should have received a copy of the GNU General Public License
00015    along with this library; see the file COPYING.  If not, write to
00016    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00017    Boston, MA 02110-1301, USA.
00018  */
00019 
00020 #include "sopranoindexwriter.h"
00021 #include "util.h"
00022 
00023 #include <Soprano/Soprano>
00024 #include <Soprano/Index/IndexFilterModel>
00025 #include <Soprano/Index/CLuceneIndex>
00026 #include <Soprano/Vocabulary/RDF>
00027 #include <Soprano/Vocabulary/Xesam>
00028 #include <Soprano/LiteralValue>
00029 
00030 #include <QtCore/QList>
00031 #include <QtCore/QHash>
00032 #include <QtCore/QVariant>
00033 #include <QtCore/QFileInfo>
00034 #include <QtCore/QFile>
00035 #include <QtCore/QUrl>
00036 #include <QtCore/QDebug>
00037 #include <QtCore/QThread>
00038 #include <QtCore/QDateTime>
00039 
00040 #include <sys/stat.h>
00041 #include <stdlib.h>
00042 #include <string.h>
00043 #include <errno.h>
00044 
00045 #include <map>
00046 #include <sstream>
00047 #include <algorithm>
00048 
00049 
00050 // IMPORTANT: strings in Strigi are apparently UTF8! Except for file names. Those are in local encoding.
00051 
00052 using namespace Soprano;
00053 
00054 
// Hash function that makes std::string usable as a QHash key.
//
// The hash is computed over the bytes of the string. Forwarding s.c_str() to
// qHash() would select Qt's generic pointer overload and hash the address of the
// character buffer instead of its content, so two equal strings living in
// different buffers would get different hash values and QHash lookups would miss.
uint qHash( const std::string& s )
{
    uint h = 0;
    for ( std::string::size_type i = 0; i < s.size(); ++i ) {
        // simple multiplicative hash; unsigned arithmetic wraps around on overflow
        h = 31 * h + static_cast<unsigned char>( s[i] );
    }
    return h;
}
00059 
00060 namespace {
00061     QString findArchivePath( const QString& path ) {
00062         QString p( path );
00063         int i = 0;
00064         while ( ( i = p.lastIndexOf( '/' ) ) > 0 ) {
00065             p.truncate( i );
00066             if ( QFileInfo( p ).isFile() ) {
00067                 return p;
00068             }
00069         }
00070         return QString();
00071     }
00072 
00073     QUrl createResourceUri( const Strigi::AnalysisResult* idx ) {
00074         // HACK: Strigi includes analysers that recurse into tar or zip archives and index
00075         // the files therein. In KDE these files could perfectly be handled through kio slaves
00076         // such as tar:/ or zip:/
00077         // Here we try to use KDE-compatible URIs for these indexed files the best we can
00078         // everything else defaults to file:/
00079         QString path = QFile::decodeName( idx->path().c_str() );
00080         QUrl url = QUrl::fromLocalFile( QFileInfo( path ).absoluteFilePath() );
00081         if ( idx->depth() > 0 ) {
00082             QString archivePath = findArchivePath( path );
00083             if ( QFile::exists( archivePath ) ) {
00084                 if ( archivePath.endsWith( QLatin1String( ".tar" ) ) ||
00085                      archivePath.endsWith( QLatin1String( ".tar.gz" ) ) ||
00086                      archivePath.endsWith( QLatin1String( ".tar.bz2" ) ) ) {
00087                     url.setScheme( "tar" );
00088                 }
00089                 else if ( archivePath.endsWith( QLatin1String( ".zip" ) ) ) {
00090                     url.setScheme( "zip" );
00091                 }
00092             }
00093         }
00094 
00095         // fallback for all
00096         if ( url.scheme().isEmpty() ) {
00097             url.setScheme( "file" );
00098         }
00099 
00100         return url;
00101     }
00102 
00103     class FileMetaData
00104     {
00105     public:
00106         // caching URIs for little speed improvement
00107         QUrl fileUri;
00108         QUrl context;
00109         std::string content;
00110     };
00111 
00112     class RegisteredFieldData
00113     {
00114     public:
00115         RegisteredFieldData( const QUrl& prop, QVariant::Type t )
00116             : property( prop ),
00117               dataType( t ),
00118               isRdfType( prop == Vocabulary::RDF::type() ) {
00119         }
00120 
00121         QUrl property;
00122         QVariant::Type dataType;
00123         bool isRdfType;
00124     };
00125 }
00126 
00127 
00128 class Strigi::Soprano::IndexWriter::Private
00129 {
00130 public:
00131     Private()
00132         : indexTransactionID( 0 ) {
00133         literalTypes[FieldRegister::stringType] = QVariant::String;
00134         literalTypes[FieldRegister::floatType] = QVariant::Double;
00135         literalTypes[FieldRegister::integerType] = QVariant::Int;
00136         literalTypes[FieldRegister::binaryType] = QVariant::ByteArray;
00137         literalTypes[FieldRegister::datetimeType] = QVariant::DateTime; // Strigi encodes datetime as unsigned integer, i.e. addValue( ..., uint )
00138     }
00139 
00140     QVariant::Type literalType( const Strigi::FieldProperties& strigiType ) {
00141         // it looks as if the typeUri can contain arbitrary values, URIs or stuff like "string"
00142         QHash<std::string, QVariant::Type>::const_iterator it = literalTypes.find( strigiType.typeUri() );
00143         if ( it == literalTypes.constEnd() ) {
00144             return LiteralValue::typeFromDataTypeUri( QUrl::fromEncoded( strigiType.typeUri().c_str() ) );
00145         }
00146         else {
00147             return *it;
00148         }
00149     }
00150 
00151     LiteralValue createLiteralValue( QVariant::Type type,
00152                                      const unsigned char* data,
00153                                      uint32_t size ) {
00154         QString value = QString::fromUtf8( ( const char* )data, size );
00155         if ( type == QVariant::DateTime ) { // dataTime is stored as integer in strigi!
00156             return LiteralValue( QDateTime::fromTime_t( value.toUInt() ) );
00157         }
00158         else {
00159             return LiteralValue::fromString( value, type );
00160         }
00161     }
00162 
00163 //    ::Soprano::Index::IndexFilterModel* repository;
00164     ::Soprano::Model* repository;
00165     int indexTransactionID;
00166 
00167 private:
00168     QHash<std::string, QVariant::Type> literalTypes;
00169 };
00170 
00171 
00172 Strigi::Soprano::IndexWriter::IndexWriter( ::Soprano::Model* model )
00173     : Strigi::IndexWriter()
00174 {
00175 //    qDebug() << "IndexWriter::IndexWriter in thread" << QThread::currentThread();
00176     d = new Private;
00177     d->repository = model;
00178 //    qDebug() << "IndexWriter::IndexWriter done in thread" << QThread::currentThread();
00179 }
00180 
00181 
00182 Strigi::Soprano::IndexWriter::~IndexWriter()
00183 {
00184     delete d;
00185 }
00186 
00187 
00188 void Strigi::Soprano::IndexWriter::commit()
00189 {
00190 }
00191 
00192 
00193 // delete all indexed data for the files listed in entries
00194 void Strigi::Soprano::IndexWriter::deleteEntries( const std::vector<std::string>& entries )
00195 {
00196 //    qDebug() << "IndexWriter::deleteEntries in thread" << QThread::currentThread();
00197 
00198     QString systemLocationUri = Util::fieldUri( FieldRegister::pathFieldName ).toString();
00199     for ( unsigned int i = 0; i < entries.size(); ++i ) {
00200         QString path = QString::fromUtf8( entries[i].c_str() );
00201         QString query = QString( "select ?g where { ?r <%1> \"%2\"^^<%3> . "
00202                                  "?g <http://www.strigi.org/fields#indexGraphFor> ?r . }" )
00203                         .arg( systemLocationUri )
00204                         .arg( path )
00205                         .arg( Vocabulary::XMLSchema::string().toString() );
00206 
00207         qDebug() << "deleteEntries query:" << query;
00208 
00209         QueryResultIterator result = d->repository->executeQuery( query, ::Soprano::Query::QUERY_LANGUAGE_SPARQL );
00210         if ( result.next() ) {
00211             Node indexGraph = result.binding( "g" );
00212             result.close();
00213 
00214             qDebug() << "Found indexGraph to delete:" << indexGraph;
00215 
00216             // delete the indexed data
00217             d->repository->removeContext( indexGraph );
00218 
00219             // delete the metadata
00220             d->repository->removeAllStatements( Statement( indexGraph, Node(), Node() ) );
00221         }
00222     }
00223 }
00224 
00225 
00226 void Strigi::Soprano::IndexWriter::deleteAllEntries()
00227 {
00228 //    qDebug() << "IndexWriter::deleteAllEntries in thread" << QThread::currentThread();
00229 
00230     // query all index graphs (FIXME: would a type derived from nrl:Graph be better than only the predicate?)
00231     QString query = QString( "select ?g where { ?g <http://www.strigi.org/fields#indexGraphFor> ?r . }" );
00232 
00233     qDebug() << "deleteAllEntries query:" << query;
00234 
00235     QueryResultIterator result = d->repository->executeQuery( query, ::Soprano::Query::QUERY_LANGUAGE_SPARQL );
00236     QList<Node> allIndexGraphs = result.iterateBindings( "g" ).allNodes();
00237     for ( QList<Node>::const_iterator it = allIndexGraphs.constBegin(); it != allIndexGraphs.constEnd(); ++it ) {
00238         Node indexGraph = *it;
00239 
00240         qDebug() << "Found indexGraph to delete:" << indexGraph;
00241 
00242         // delete the indexed data
00243         d->repository->removeContext( indexGraph );
00244 
00245         // delete the metadata
00246         d->repository->removeAllStatements( Statement( indexGraph, Node(), Node() ) );
00247     }
00248 }
00249 
00250 
00251 // called for each indexed file
00252 void Strigi::Soprano::IndexWriter::startAnalysis( const AnalysisResult* idx )
00253 {
00254     if ( idx->depth() > 0 ) {
00255         return;
00256     }
00257 
00258 //    qDebug() << "IndexWriter::startAnalysis in thread" << QThread::currentThread();
00259     FileMetaData* data = new FileMetaData();
00260     data->fileUri = createResourceUri( idx );
00261 
00262     // let's check if we already have data on the file
00263     StatementIterator it = d->repository->listStatements( Node(),
00264                                                           QUrl::fromEncoded( "http://www.strigi.org/fields#indexGraphFor", QUrl::StrictMode ), // FIXME: put the URI somewhere else
00265                                                           data->fileUri );
00266     if ( it.next() ) {
00267         data->context = it.current().subject().uri();
00268     }
00269     else {
00270         data->context = Util::uniqueUri( "http://www.strigi.org/contexts/", d->repository );
00271     }
00272 
00273     qDebug() << "Starting analysis for" << data->fileUri << "in thread" << QThread::currentThread();
00274 
00275     idx->setWriterData( data );
00276 }
00277 
00278 
00279 void Strigi::Soprano::IndexWriter::addText( const AnalysisResult* idx, const char* text, int32_t length )
00280 {
00281     if ( idx->depth() > 0 ) {
00282         return;
00283     }
00284 
00285     FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
00286     md->content.append( text, length );
00287 }
00288 
00289 
00290 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
00291                                              const RegisteredField* field,
00292                                              const std::string& value )
00293 {
00294     if ( idx->depth() > 0 ) {
00295         return;
00296     }
00297 
00298 //    qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
00299     if ( value.length() > 0 ) {
00300         FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
00301         RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( field->writerData() );
00302 
00303         if ( rfd->isRdfType ) {
00304 
00305             // Strigi uses rdf:type improperly since it stores the value as a string. We have to
00306             // make sure it is a resource. The problem is that this results in the type not being
00307             // indexed properly. Thus, it cannot be searched with normal lucene queries.
00308             // That is why we need to introduce a stringType property
00309 
00310             d->repository->addStatement( Statement( md->fileUri,
00311                                                     ::Soprano::Vocabulary::RDF::type(),
00312                                                     QUrl::fromEncoded( value.c_str(), QUrl::StrictMode ), // fromEncoded is faster than the plain constructor and all Xesam URIs work here
00313                                                     md->context) );
00314             d->repository->addStatement( Statement( md->fileUri,
00315                                                     QUrl::fromEncoded( "http://strigi.sourceforge.net/fields#rdf-string-type", QUrl::StrictMode ),
00316                                                     LiteralValue( QString::fromUtf8( value.c_str() ) ),
00317                                                     md->context) );
00318         }
00319 
00320         else {
00321             if ( rfd->dataType != QVariant::Invalid ) {
00322                 d->repository->addStatement( Statement( md->fileUri,
00323                                                         rfd->property,
00324                                                         d->createLiteralValue( rfd->dataType, ( unsigned char* )value.c_str(), value.length() ),
00325                                                         md->context) );
00326             }
00327             else {
00328                 qDebug() << "Ignoring field" << rfd->property << "due to unknown type" << field->properties().typeUri().c_str();
00329             }
00330         }
00331     }
00332 //    qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
00333 }
00334 
00335 
00336 // the main addValue method
00337 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
00338                                              const RegisteredField* field,
00339                                              const unsigned char* data,
00340                                              uint32_t size )
00341 {
00342     addValue( idx, field, std::string( ( const char* )data, size ) );
00343 }
00344 
00345 
00346 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult*, const RegisteredField*,
00347                                              const std::string&, const std::string& )
00348 {
00349     // we do not support map types
00350 }
00351 
00352 
00353 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
00354                                              const RegisteredField* field,
00355                                              uint32_t value )
00356 {
00357     if ( idx->depth() > 0 ) {
00358         return;
00359     }
00360 
00361 //    qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
00362     FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
00363     RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( field->writerData() );
00364 
00365     LiteralValue val( value );
00366     if ( field->type() == FieldRegister::datetimeType ) {
00367         val = QDateTime::fromTime_t( value );
00368     }
00369 
00370     d->repository->addStatement( Statement( md->fileUri,
00371                                             rfd->property,
00372                                             val,
00373                                             md->context) );
00374 //    qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
00375 }
00376 
00377 
00378 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
00379                                              const RegisteredField* field,
00380                                              int32_t value )
00381 {
00382     if ( idx->depth() > 0 ) {
00383         return;
00384     }
00385 
00386 //    qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
00387     FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
00388     RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( field->writerData() );
00389 
00390     d->repository->addStatement( Statement( md->fileUri,
00391                                             rfd->property,
00392                                             LiteralValue( value ),
00393                                             md->context) );
00394 //    qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
00395 }
00396 
00397 
00398 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
00399                                              const RegisteredField* field,
00400                                              double value )
00401 {
00402     if ( idx->depth() > 0 ) {
00403         return;
00404     }
00405 
00406 //    qDebug() << "IndexWriter::addValue in thread" << QThread::currentThread();
00407     FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
00408     RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( field->writerData() );
00409 
00410     d->repository->addStatement( Statement( md->fileUri,
00411                                             rfd->property,
00412                                             LiteralValue( value ),
00413                                             md->context) );
00414 //    qDebug() << "IndexWriter::addValue done in thread" << QThread::currentThread();
00415 }
00416 
00417 
00418 void Strigi::Soprano::IndexWriter::addTriplet( const std::string& subject,
00419                                                const std::string& predicate, const std::string& object )
00420 {
00421     // PROBLEM: which named graph (context) should we use here? Create a new one for each triple? Use one until the
00422     // next commit()?
00423 
00424     // FIXME: create an NRL metadata graph
00425     d->repository->addStatement( Statement( Node( QUrl( QString::fromUtf8( subject.c_str() ) ) ),
00426                                             Node( QUrl( QString::fromUtf8( predicate.c_str() ) ) ),
00427                                             Node( QUrl( QString::fromUtf8( object.c_str() ) ) ),
00428                                             Node() ) );
00429 }
00430 
00431 
00432 // called after each indexed file
00433 void Strigi::Soprano::IndexWriter::finishAnalysis( const AnalysisResult* idx )
00434 {
00435     if ( idx->depth() > 0 ) {
00436         return;
00437     }
00438 
00439 //    qDebug() << "IndexWriter::finishAnalysis in thread" << QThread::currentThread();
00440     FileMetaData* md = static_cast<FileMetaData*>( idx->writerData() );
00441 
00442     if ( md->content.length() > 0 ) {
00443         d->repository->addStatement( Statement( md->fileUri,
00444                                                 Vocabulary::Xesam::asText(),
00445                                                 LiteralValue( QString::fromUtf8( md->content.c_str() ) ),
00446                                                 md->context ) );
00447     }
00448 
00449     // Strigi only indexes files and extractors mostly (if at all) store the xesam:DataObject type (i.e. the contents)
00450     // Thus, here we go the easy way and mark each indexed file as a xesam:File.
00451     d->repository->addStatement( Statement( md->fileUri,
00452                                             Vocabulary::RDF::type(),
00453                                             Vocabulary::Xesam::File(),
00454                                             md->context ) );
00455 
00456 
00457     // create the provedance data for the data graph
00458     // TODO: add more data at some point when it becomes of interest
00459     QUrl metaDataContext = Util::uniqueUri( "http://www.strigi.org/graphMetaData/", d->repository );
00460     d->repository->addStatement( Statement( md->context,
00461                                             Vocabulary::RDF::type(),
00462                                             Vocabulary::NRL::InstanceBase(),
00463                                             metaDataContext ) );
00464     d->repository->addStatement( Statement( md->context,
00465                                             Vocabulary::NAO::created(),
00466                                             LiteralValue( QDateTime::currentDateTime() ),
00467                                             metaDataContext ) );
00468     d->repository->addStatement( Statement( md->context,
00469                                             QUrl::fromEncoded( "http://www.strigi.org/fields#indexGraphFor", QUrl::StrictMode ), // FIXME: put the URI somewhere else
00470                                             md->fileUri,
00471                                             metaDataContext ) );
00472     d->repository->addStatement( Statement( metaDataContext,
00473                                             Vocabulary::RDF::type(),
00474                                             Vocabulary::NRL::GraphMetadata(),
00475                                             metaDataContext ) );
00476 
00477     // cleanup
00478     delete md;
00479     idx->setWriterData( 0 );
00480 
00481 //    qDebug() << "IndexWriter::finishAnalysis done in thread" << QThread::currentThread();
00482 }
00483 
00484 
00485 void Strigi::Soprano::IndexWriter::initWriterData( const Strigi::FieldRegister& f )
00486 {
00487     map<string, RegisteredField*>::const_iterator i;
00488     map<string, RegisteredField*>::const_iterator end = f.fields().end();
00489     for (i = f.fields().begin(); i != end; ++i) {
00490         QUrl prop = Util::fieldUri( i->second->key() );
00491         i->second->setWriterData( new RegisteredFieldData( prop,
00492                                                            prop == Vocabulary::RDF::type()
00493                                                            ? QVariant::Invalid
00494                                                            : d->literalType( i->second->properties() ) ) );
00495     }
00496 }
00497 
00498 
00499 void Strigi::Soprano::IndexWriter::releaseWriterData( const Strigi::FieldRegister& f )
00500 {
00501     map<string, RegisteredField*>::const_iterator i;
00502     map<string, RegisteredField*>::const_iterator end = f.fields().end();
00503     for (i = f.fields().begin(); i != end; ++i) {
00504         delete static_cast<RegisteredFieldData*>( i->second->writerData() );
00505         i->second->setWriterData( 0 );
00506     }
00507 }

NepomukDaemons

Skip menu "NepomukDaemons"
  • Main Page
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Namespace Members
  • Class Members
  • Related Pages

API Reference

Skip menu "API Reference"
  • KCMShell
  • KNotify
  • KStyles
  • Nepomuk Daemons
Generated for API Reference by doxygen 1.5.4
This website is maintained by Adriaan de Groot and Allen Winter.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. | Legal