00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
#include "sopranoindexwriter.h"
#include "util.h"

#include <Soprano/Soprano>
#include <Soprano/Index/IndexFilterModel>
#include <Soprano/Index/CLuceneIndex>
#include <Soprano/Vocabulary/RDF>
#include <Soprano/Vocabulary/Xesam>
#include <Soprano/LiteralValue>

#include <QtCore/QList>
#include <QtCore/QHash>
#include <QtCore/QVariant>
#include <QtCore/QFileInfo>
#include <QtCore/QFile>
#include <QtCore/QUrl>
#include <QtCore/QDebug>
#include <QtCore/QThread>
#include <QtCore/QDateTime>
#include <QtCore/QByteArray>

#include <sys/stat.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

#include <map>
#include <sstream>
#include <algorithm>
00048
00049
00050
00051
00052 using namespace Soprano;
00053
00054
00055 uint qHash( const std::string& s )
00056 {
00057 return qHash( s.c_str() );
00058 }
00059
00060 namespace {
00061 QString findArchivePath( const QString& path ) {
00062 QString p( path );
00063 int i = 0;
00064 while ( ( i = p.lastIndexOf( '/' ) ) > 0 ) {
00065 p.truncate( i );
00066 if ( QFileInfo( p ).isFile() ) {
00067 return p;
00068 }
00069 }
00070 return QString();
00071 }
00072
00073 QUrl createResourceUri( const Strigi::AnalysisResult* idx ) {
00074
00075
00076
00077
00078
00079 QString path = QFile::decodeName( idx->path().c_str() );
00080 QUrl url = QUrl::fromLocalFile( QFileInfo( path ).absoluteFilePath() );
00081 if ( idx->depth() > 0 ) {
00082 QString archivePath = findArchivePath( path );
00083 if ( QFile::exists( archivePath ) ) {
00084 if ( archivePath.endsWith( QLatin1String( ".tar" ) ) ||
00085 archivePath.endsWith( QLatin1String( ".tar.gz" ) ) ||
00086 archivePath.endsWith( QLatin1String( ".tar.bz2" ) ) ) {
00087 url.setScheme( "tar" );
00088 }
00089 else if ( archivePath.endsWith( QLatin1String( ".zip" ) ) ) {
00090 url.setScheme( "zip" );
00091 }
00092 }
00093 }
00094
00095
00096 if ( url.scheme().isEmpty() ) {
00097 url.setScheme( "file" );
00098 }
00099
00100 return url;
00101 }
00102
00103 class FileMetaData
00104 {
00105 public:
00106
00107 QUrl fileUri;
00108 QUrl context;
00109 std::string content;
00110 };
00111
00112 class RegisteredFieldData
00113 {
00114 public:
00115 RegisteredFieldData( const QUrl& prop, QVariant::Type t )
00116 : property( prop ),
00117 dataType( t ),
00118 isRdfType( prop == Vocabulary::RDF::type() ) {
00119 }
00120
00121 QUrl property;
00122 QVariant::Type dataType;
00123 bool isRdfType;
00124 };
00125 }
00126
00127
00128 class Strigi::Soprano::IndexWriter::Private
00129 {
00130 public:
00131 Private()
00132 : indexTransactionID( 0 ) {
00133 literalTypes[FieldRegister::stringType] = QVariant::String;
00134 literalTypes[FieldRegister::floatType] = QVariant::Double;
00135 literalTypes[FieldRegister::integerType] = QVariant::Int;
00136 literalTypes[FieldRegister::binaryType] = QVariant::ByteArray;
00137 literalTypes[FieldRegister::datetimeType] = QVariant::DateTime;
00138 }
00139
00140 QVariant::Type literalType( const Strigi::FieldProperties& strigiType ) {
00141
00142 QHash<std::string, QVariant::Type>::const_iterator it = literalTypes.find( strigiType.typeUri() );
00143 if ( it == literalTypes.constEnd() ) {
00144 return LiteralValue::typeFromDataTypeUri( QUrl::fromEncoded( strigiType.typeUri().c_str() ) );
00145 }
00146 else {
00147 return *it;
00148 }
00149 }
00150
00151 LiteralValue createLiteralValue( QVariant::Type type,
00152 const unsigned char* data,
00153 uint32_t size ) {
00154 QString value = QString::fromUtf8( ( const char* )data, size );
00155 if ( type == QVariant::DateTime ) {
00156 return LiteralValue( QDateTime::fromTime_t( value.toUInt() ) );
00157 }
00158 else {
00159 return LiteralValue::fromString( value, type );
00160 }
00161 }
00162
00163
00164 ::Soprano::Model* repository;
00165 int indexTransactionID;
00166
00167 private:
00168 QHash<std::string, QVariant::Type> literalTypes;
00169 };
00170
00171
00172 Strigi::Soprano::IndexWriter::IndexWriter( ::Soprano::Model* model )
00173 : Strigi::IndexWriter()
00174 {
00175
00176 d = new Private;
00177 d->repository = model;
00178
00179 }
00180
00181
00182 Strigi::Soprano::IndexWriter::~IndexWriter()
00183 {
00184 delete d;
00185 }
00186
00187
00188 void Strigi::Soprano::IndexWriter::commit()
00189 {
00190 }
00191
00192
00193
00194 void Strigi::Soprano::IndexWriter::deleteEntries( const std::vector<std::string>& entries )
00195 {
00196
00197
00198 QString systemLocationUri = Util::fieldUri( FieldRegister::pathFieldName ).toString();
00199 for ( unsigned int i = 0; i < entries.size(); ++i ) {
00200 QString path = QString::fromUtf8( entries[i].c_str() );
00201 QString query = QString( "select ?g where { ?r <%1> \"%2\"^^<%3> . "
00202 "?g <http://www.strigi.org/fields#indexGraphFor> ?r . }" )
00203 .arg( systemLocationUri )
00204 .arg( path )
00205 .arg( Vocabulary::XMLSchema::string().toString() );
00206
00207 qDebug() << "deleteEntries query:" << query;
00208
00209 QueryResultIterator result = d->repository->executeQuery( query, ::Soprano::Query::QUERY_LANGUAGE_SPARQL );
00210 if ( result.next() ) {
00211 Node indexGraph = result.binding( "g" );
00212 result.close();
00213
00214 qDebug() << "Found indexGraph to delete:" << indexGraph;
00215
00216
00217 d->repository->removeContext( indexGraph );
00218
00219
00220 d->repository->removeAllStatements( Statement( indexGraph, Node(), Node() ) );
00221 }
00222 }
00223 }
00224
00225
00226 void Strigi::Soprano::IndexWriter::deleteAllEntries()
00227 {
00228
00229
00230
00231 QString query = QString( "select ?g where { ?g <http://www.strigi.org/fields#indexGraphFor> ?r . }" );
00232
00233 qDebug() << "deleteAllEntries query:" << query;
00234
00235 QueryResultIterator result = d->repository->executeQuery( query, ::Soprano::Query::QUERY_LANGUAGE_SPARQL );
00236 QList<Node> allIndexGraphs = result.iterateBindings( "g" ).allNodes();
00237 for ( QList<Node>::const_iterator it = allIndexGraphs.constBegin(); it != allIndexGraphs.constEnd(); ++it ) {
00238 Node indexGraph = *it;
00239
00240 qDebug() << "Found indexGraph to delete:" << indexGraph;
00241
00242
00243 d->repository->removeContext( indexGraph );
00244
00245
00246 d->repository->removeAllStatements( Statement( indexGraph, Node(), Node() ) );
00247 }
00248 }
00249
00250
00251
00252 void Strigi::Soprano::IndexWriter::startAnalysis( const AnalysisResult* idx )
00253 {
00254 if ( idx->depth() > 0 ) {
00255 return;
00256 }
00257
00258
00259 FileMetaData* data = new FileMetaData();
00260 data->fileUri = createResourceUri( idx );
00261
00262
00263 StatementIterator it = d->repository->listStatements( Node(),
00264 QUrl::fromEncoded( "http://www.strigi.org/fields#indexGraphFor", QUrl::StrictMode ),
00265 data->fileUri );
00266 if ( it.next() ) {
00267 data->context = it.current().subject().uri();
00268 }
00269 else {
00270 data->context = Util::uniqueUri( "http://www.strigi.org/contexts/", d->repository );
00271 }
00272
00273 qDebug() << "Starting analysis for" << data->fileUri << "in thread" << QThread::currentThread();
00274
00275 idx->setWriterData( data );
00276 }
00277
00278
00279 void Strigi::Soprano::IndexWriter::addText( const AnalysisResult* idx, const char* text, int32_t length )
00280 {
00281 if ( idx->depth() > 0 ) {
00282 return;
00283 }
00284
00285 FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
00286 md->content.append( text, length );
00287 }
00288
00289
00290 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
00291 const RegisteredField* field,
00292 const std::string& value )
00293 {
00294 if ( idx->depth() > 0 ) {
00295 return;
00296 }
00297
00298
00299 if ( value.length() > 0 ) {
00300 FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
00301 RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( field->writerData() );
00302
00303 if ( rfd->isRdfType ) {
00304
00305
00306
00307
00308
00309
00310 d->repository->addStatement( Statement( md->fileUri,
00311 ::Soprano::Vocabulary::RDF::type(),
00312 QUrl::fromEncoded( value.c_str(), QUrl::StrictMode ),
00313 md->context) );
00314 d->repository->addStatement( Statement( md->fileUri,
00315 QUrl::fromEncoded( "http://strigi.sourceforge.net/fields#rdf-string-type", QUrl::StrictMode ),
00316 LiteralValue( QString::fromUtf8( value.c_str() ) ),
00317 md->context) );
00318 }
00319
00320 else {
00321 if ( rfd->dataType != QVariant::Invalid ) {
00322 d->repository->addStatement( Statement( md->fileUri,
00323 rfd->property,
00324 d->createLiteralValue( rfd->dataType, ( unsigned char* )value.c_str(), value.length() ),
00325 md->context) );
00326 }
00327 else {
00328 qDebug() << "Ignoring field" << rfd->property << "due to unknown type" << field->properties().typeUri().c_str();
00329 }
00330 }
00331 }
00332
00333 }
00334
00335
00336
00337 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
00338 const RegisteredField* field,
00339 const unsigned char* data,
00340 uint32_t size )
00341 {
00342 addValue( idx, field, std::string( ( const char* )data, size ) );
00343 }
00344
00345
00346 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult*, const RegisteredField*,
00347 const std::string&, const std::string& )
00348 {
00349
00350 }
00351
00352
00353 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
00354 const RegisteredField* field,
00355 uint32_t value )
00356 {
00357 if ( idx->depth() > 0 ) {
00358 return;
00359 }
00360
00361
00362 FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
00363 RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( field->writerData() );
00364
00365 LiteralValue val( value );
00366 if ( field->type() == FieldRegister::datetimeType ) {
00367 val = QDateTime::fromTime_t( value );
00368 }
00369
00370 d->repository->addStatement( Statement( md->fileUri,
00371 rfd->property,
00372 val,
00373 md->context) );
00374
00375 }
00376
00377
00378 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
00379 const RegisteredField* field,
00380 int32_t value )
00381 {
00382 if ( idx->depth() > 0 ) {
00383 return;
00384 }
00385
00386
00387 FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
00388 RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( field->writerData() );
00389
00390 d->repository->addStatement( Statement( md->fileUri,
00391 rfd->property,
00392 LiteralValue( value ),
00393 md->context) );
00394
00395 }
00396
00397
00398 void Strigi::Soprano::IndexWriter::addValue( const AnalysisResult* idx,
00399 const RegisteredField* field,
00400 double value )
00401 {
00402 if ( idx->depth() > 0 ) {
00403 return;
00404 }
00405
00406
00407 FileMetaData* md = reinterpret_cast<FileMetaData*>( idx->writerData() );
00408 RegisteredFieldData* rfd = reinterpret_cast<RegisteredFieldData*>( field->writerData() );
00409
00410 d->repository->addStatement( Statement( md->fileUri,
00411 rfd->property,
00412 LiteralValue( value ),
00413 md->context) );
00414
00415 }
00416
00417
00418 void Strigi::Soprano::IndexWriter::addTriplet( const std::string& subject,
00419 const std::string& predicate, const std::string& object )
00420 {
00421
00422
00423
00424
00425 d->repository->addStatement( Statement( Node( QUrl( QString::fromUtf8( subject.c_str() ) ) ),
00426 Node( QUrl( QString::fromUtf8( predicate.c_str() ) ) ),
00427 Node( QUrl( QString::fromUtf8( object.c_str() ) ) ),
00428 Node() ) );
00429 }
00430
00431
00432
00433 void Strigi::Soprano::IndexWriter::finishAnalysis( const AnalysisResult* idx )
00434 {
00435 if ( idx->depth() > 0 ) {
00436 return;
00437 }
00438
00439
00440 FileMetaData* md = static_cast<FileMetaData*>( idx->writerData() );
00441
00442 if ( md->content.length() > 0 ) {
00443 d->repository->addStatement( Statement( md->fileUri,
00444 Vocabulary::Xesam::asText(),
00445 LiteralValue( QString::fromUtf8( md->content.c_str() ) ),
00446 md->context ) );
00447 }
00448
00449
00450
00451 d->repository->addStatement( Statement( md->fileUri,
00452 Vocabulary::RDF::type(),
00453 Vocabulary::Xesam::File(),
00454 md->context ) );
00455
00456
00457
00458
00459 QUrl metaDataContext = Util::uniqueUri( "http://www.strigi.org/graphMetaData/", d->repository );
00460 d->repository->addStatement( Statement( md->context,
00461 Vocabulary::RDF::type(),
00462 Vocabulary::NRL::InstanceBase(),
00463 metaDataContext ) );
00464 d->repository->addStatement( Statement( md->context,
00465 Vocabulary::NAO::created(),
00466 LiteralValue( QDateTime::currentDateTime() ),
00467 metaDataContext ) );
00468 d->repository->addStatement( Statement( md->context,
00469 QUrl::fromEncoded( "http://www.strigi.org/fields#indexGraphFor", QUrl::StrictMode ),
00470 md->fileUri,
00471 metaDataContext ) );
00472 d->repository->addStatement( Statement( metaDataContext,
00473 Vocabulary::RDF::type(),
00474 Vocabulary::NRL::GraphMetadata(),
00475 metaDataContext ) );
00476
00477
00478 delete md;
00479 idx->setWriterData( 0 );
00480
00481
00482 }
00483
00484
00485 void Strigi::Soprano::IndexWriter::initWriterData( const Strigi::FieldRegister& f )
00486 {
00487 map<string, RegisteredField*>::const_iterator i;
00488 map<string, RegisteredField*>::const_iterator end = f.fields().end();
00489 for (i = f.fields().begin(); i != end; ++i) {
00490 QUrl prop = Util::fieldUri( i->second->key() );
00491 i->second->setWriterData( new RegisteredFieldData( prop,
00492 prop == Vocabulary::RDF::type()
00493 ? QVariant::Invalid
00494 : d->literalType( i->second->properties() ) ) );
00495 }
00496 }
00497
00498
00499 void Strigi::Soprano::IndexWriter::releaseWriterData( const Strigi::FieldRegister& f )
00500 {
00501 map<string, RegisteredField*>::const_iterator i;
00502 map<string, RegisteredField*>::const_iterator end = f.fields().end();
00503 for (i = f.fields().begin(); i != end; ++i) {
00504 delete static_cast<RegisteredFieldData*>( i->second->writerData() );
00505 i->second->setWriterData( 0 );
00506 }
00507 }