diff options
Diffstat (limited to 'tools/assistant/lib/qhelpsearchindexwriter_clucene.cpp')
-rw-r--r-- | tools/assistant/lib/qhelpsearchindexwriter_clucene.cpp | 481 |
1 files changed, 481 insertions, 0 deletions
diff --git a/tools/assistant/lib/qhelpsearchindexwriter_clucene.cpp b/tools/assistant/lib/qhelpsearchindexwriter_clucene.cpp new file mode 100644 index 0000000..e53bbae --- /dev/null +++ b/tools/assistant/lib/qhelpsearchindexwriter_clucene.cpp @@ -0,0 +1,481 @@ +/**************************************************************************** +** +** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). +** Contact: Qt Software Information (qt-info@nokia.com) +** +** This file is part of the Qt Assistant of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the either Technology Preview License Agreement or the +** Beta Release License Agreement. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain +** additional rights. These rights are described in the Nokia Qt LGPL +** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this +** package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. +** +** If you are unsure which license is appropriate for your use, please +** contact the sales department at qt-sales@nokia.com. +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include "qhelpenginecore.h" +#include "qhelp_global.h" +#include "fulltextsearch/qhits_p.h" +#include "fulltextsearch/qquery_p.h" +#include "fulltextsearch/qanalyzer_p.h" +#include "fulltextsearch/qdocument_p.h" +#include "fulltextsearch/qsearchable_p.h" +#include "fulltextsearch/qindexreader_p.h" +#include "fulltextsearch/qindexwriter_p.h" +#include "qhelpsearchindexwriter_clucene_p.h" + +#include <QtCore/QDir> +#include <QtCore/QString> +#include <QtCore/QFileInfo> +#include <QtCore/QTextCodec> +#include <QtCore/QTextStream> + +#include <QtNetwork/QLocalSocket> +#include <QtNetwork/QLocalServer> + +QT_BEGIN_NAMESPACE + +namespace qt { + namespace fulltextsearch { + namespace clucene { + +class DocumentHelper +{ +public: + DocumentHelper(const QString& fileName, const QByteArray &data) + : fileName(fileName) , data(readData(data)) {} + ~DocumentHelper() {} + + bool addFieldsToDocument(QCLuceneDocument *document, + const QString &namespaceName, const QString &attributes = QString()) + { + if (!document) + return false; + + if(!data.isEmpty()) { + QString parsedData = parseData(); + QString parsedTitle = QHelpGlobal::documentTitle(data); + + if(!parsedData.isEmpty()) { + document->add(new QCLuceneField(QLatin1String("content"), + parsedData,QCLuceneField::INDEX_TOKENIZED)); + document->add(new QCLuceneField(QLatin1String("path"), fileName, + QCLuceneField::STORE_YES | QCLuceneField::INDEX_UNTOKENIZED)); + document->add(new QCLuceneField(QLatin1String("title"), parsedTitle, + QCLuceneField::STORE_YES | QCLuceneField::INDEX_UNTOKENIZED)); + document->add(new QCLuceneField(QLatin1String("titleTokenized"), parsedTitle, + QCLuceneField::STORE_YES | QCLuceneField::INDEX_TOKENIZED)); + document->add(new QCLuceneField(QLatin1String("namespace"), namespaceName, + QCLuceneField::STORE_YES | QCLuceneField::INDEX_UNTOKENIZED)); + document->add(new QCLuceneField(QLatin1String("attribute"), attributes, + QCLuceneField::STORE_YES | QCLuceneField::INDEX_TOKENIZED)); + return true; + } + } + + return false; + } + +private: + QString readData(const QByteArray &data) + { + QTextStream textStream(data); + QByteArray charSet = QHelpGlobal::charsetFromData(data).toLatin1(); + textStream.setCodec(QTextCodec::codecForName(charSet.constData())); + + QString stream = textStream.readAll(); + if (stream.isNull() || stream.isEmpty()) + return QString(); + + return stream; + } + + QString parseData() const + { + const int length = data.length(); + const QChar *buf = data.unicode(); + + QString parsedContent; + parsedContent.reserve(length); + + bool valid = true; + int j = 0, count = 0; + + QChar c; + while (j < length) { + c = buf[j++]; + if (c == QLatin1Char('<') || c == QLatin1Char('&')) { + if (count > 1) + parsedContent.append(QLatin1Char(' ')); + count = 0; + valid = false; + continue; + } + if ((c == QLatin1Char('>') || c == QLatin1Char(';')) && !valid) { + valid = true; + continue; + } + if (!valid) + continue; + + if (c.isLetterOrNumber() || c.isPrint()) { + ++count; + parsedContent.append(c.toLower()); + } else { + if (count > 1) + parsedContent.append(QLatin1Char(' ')); + count = 0; + } + } + + return parsedContent; + } + +private: + QString fileName; + QString data; +}; + + +QHelpSearchIndexWriter::QHelpSearchIndexWriter() + : QThread(0) + , m_cancel(false) +{ + // nothing todo +} + +QHelpSearchIndexWriter::~QHelpSearchIndexWriter() +{ + mutex.lock(); + this->m_cancel = true; + waitCondition.wakeOne(); + mutex.unlock(); + + wait(); +} + +void QHelpSearchIndexWriter::cancelIndexing() +{ + mutex.lock(); + this->m_cancel = true; + mutex.unlock(); +} + +void QHelpSearchIndexWriter::updateIndex(const QString &collectionFile, + const QString &indexFilesFolder, bool reindex) +{ + mutex.lock(); + this->m_cancel = false; + this->m_reindex = reindex; + this->m_collectionFile = collectionFile; + this->m_indexFilesFolder = indexFilesFolder; + mutex.unlock(); + + start(QThread::NormalPriority); +} + +void QHelpSearchIndexWriter::optimizeIndex() +{ + if (QCLuceneIndexReader::indexExists(m_indexFilesFolder)) { + if (QCLuceneIndexReader::isLocked(m_indexFilesFolder)) + return; + + QCLuceneStandardAnalyzer analyzer; + QCLuceneIndexWriter writer(m_indexFilesFolder, analyzer, false); + writer.optimize(); + writer.close(); + } +} + +void QHelpSearchIndexWriter::run() +{ + QMutexLocker mutexLocker(&mutex); + + if (m_cancel) + return; + + const bool reindex = this->m_reindex; + const QString collectionFile(this->m_collectionFile); + + mutexLocker.unlock(); + + QHelpEngineCore engine(collectionFile, 0); + if (!engine.setupData()) + return; + + const QLatin1String key("CluceneIndexedNamespaces"); + if (reindex) + engine.setCustomValue(key, QLatin1String("")); + + QMap<QString, QDateTime> indexMap; + const QLatin1String oldKey("CluceneSearchNamespaces"); + if (!engine.customValue(oldKey, QString()).isNull()) { + // old style qhc file < 4.4.2, need to convert... + const QStringList indexedNamespaces = engine.customValue(oldKey). + toString().split(QLatin1String("|"), QString::SkipEmptyParts); + foreach (const QString& nameSpace, indexedNamespaces) + indexMap.insert(nameSpace, QDateTime()); + engine.removeCustomValue(oldKey); + } else { + QDataStream dataStream(engine.customValue(key).toByteArray()); + dataStream >> indexMap; + } + + QString indexPath = m_indexFilesFolder; + + QFileInfo fInfo(indexPath); + if (fInfo.exists() && !fInfo.isWritable()) { + qWarning("Full Text Search, could not create index (missing permissions)."); + return; + } + + emit indexingStarted(); + + QCLuceneIndexWriter *writer = 0; + QCLuceneStandardAnalyzer analyzer; + const QStringList registeredDocs = engine.registeredDocumentations(); + + QLocalSocket localSocket; + localSocket.connectToServer(QString(QLatin1String("QtAssistant%1")) + .arg(QLatin1String(QT_VERSION_STR))); + + QLocalServer localServer; + bool otherInstancesRunning = true; + if (!localSocket.waitForConnected()) { + otherInstancesRunning = false; + localServer.listen(QString(QLatin1String("QtAssistant%1")) + .arg(QLatin1String(QT_VERSION_STR))); + } + +#if !defined(QT_NO_EXCEPTIONS) + try { +#endif + // check if it's locked, and if the other instance is running + if (!otherInstancesRunning && QCLuceneIndexReader::isLocked(indexPath)) + QCLuceneIndexReader::unlock(indexPath); + + if (QCLuceneIndexReader::isLocked(indexPath)) { + // poll unless indexing finished to fake progress + while (QCLuceneIndexReader::isLocked(indexPath)) { + mutexLocker.relock(); + if (m_cancel) + break; + mutexLocker.unlock(); + this->sleep(1); + } + emit indexingFinished(); + return; + } + + if (QCLuceneIndexReader::indexExists(indexPath) && !reindex) { + foreach(const QString& namespaceName, registeredDocs) { + mutexLocker.relock(); + if (m_cancel) { + emit indexingFinished(); + return; + } + mutexLocker.unlock(); + + if (!indexMap.contains(namespaceName)) { + // make sure we remove some partly indexed stuff + removeDocuments(indexPath, namespaceName); + } else { + QString path = engine.documentationFileName(namespaceName); + if (indexMap.value(namespaceName) < QFileInfo(path).lastModified()) { + // make sure we remove some outdated indexed stuff + indexMap.remove(namespaceName); + removeDocuments(indexPath, namespaceName); + } + + if (indexMap.contains(namespaceName)) { + // make sure we really have content indexed for namespace + // NOTE: Extra variable just for GCC 3.3.5 + QLatin1String key("namespace"); + QCLuceneTermQuery query(QCLuceneTerm(key, namespaceName)); + QCLuceneIndexSearcher indexSearcher(indexPath); + QCLuceneHits hits = indexSearcher.search(query); + if (hits.length() <= 0) + indexMap.remove(namespaceName); + } + } + } + writer = new QCLuceneIndexWriter(indexPath, analyzer, false); + } else { + indexMap.clear(); + writer = new QCLuceneIndexWriter(indexPath, analyzer, true); + } +#if !defined(QT_NO_EXCEPTIONS) + } catch (...) { + qWarning("Full Text Search, could not create index writer."); + return; + } +#endif + + writer->setMergeFactor(100); + writer->setMinMergeDocs(1000); + writer->setMaxFieldLength(QCLuceneIndexWriter::DEFAULT_MAX_FIELD_LENGTH); + + QStringList namespaces; + foreach(const QString& namespaceName, registeredDocs) { + mutexLocker.relock(); + if (m_cancel) { + writer->close(); + delete writer; + emit indexingFinished(); + return; + } + mutexLocker.unlock(); + + namespaces.append(namespaceName); + if (indexMap.contains(namespaceName)) + continue; + + const QList<QStringList> attributeSets = + engine.filterAttributeSets(namespaceName); + + if (attributeSets.isEmpty()) { + const QList<QUrl> docFiles = indexableFiles(&engine, namespaceName, + QStringList()); + if (!addDocuments(docFiles, engine, QStringList(), namespaceName, + writer, analyzer)) + break; + } else { + bool bail = false; + foreach (const QStringList& attributes, attributeSets) { + const QList<QUrl> docFiles = indexableFiles(&engine, + namespaceName, attributes); + if (!addDocuments(docFiles, engine, attributes, namespaceName, + writer, analyzer)) { + bail = true; + break; + } + } + if (bail) + break; + } + + mutexLocker.relock(); + if (!m_cancel) { + QString path(engine.documentationFileName(namespaceName)); + indexMap.insert(namespaceName, QFileInfo(path).lastModified()); + writeIndexMap(engine, indexMap); + } + mutexLocker.unlock(); + } + + writer->close(); + delete writer; + + mutexLocker.relock(); + if (!m_cancel) { + mutexLocker.unlock(); + + QStringList indexedNamespaces = indexMap.keys(); + foreach(const QString& namespaceName, indexedNamespaces) { + mutexLocker.relock(); + if (m_cancel) + break; + mutexLocker.unlock(); + + if (!namespaces.contains(namespaceName)) { + indexMap.remove(namespaceName); + writeIndexMap(engine, indexMap); + removeDocuments(indexPath, namespaceName); + } + } + } + emit indexingFinished(); +} + +bool QHelpSearchIndexWriter::addDocuments(const QList<QUrl> docFiles, + const QHelpEngineCore &engine, const QStringList &attributes, + const QString &namespaceName, QCLuceneIndexWriter *writer, + QCLuceneAnalyzer &analyzer) +{ + QMutexLocker locker(&mutex); + const QString attrList = attributes.join(QLatin1String(" ")); + + locker.unlock(); + foreach(const QUrl& url, docFiles) { + QCLuceneDocument document; + DocumentHelper helper(url.toString(), engine.fileData(url)); + if (helper.addFieldsToDocument(&document, namespaceName, attrList)) + writer->addDocument(document, analyzer); + + locker.relock(); + if (m_cancel) + return false; + locker.unlock(); + } + + return true; +} + +void QHelpSearchIndexWriter::removeDocuments(const QString &indexPath, + const QString &namespaceName) +{ + if (namespaceName.isEmpty() || QCLuceneIndexReader::isLocked(indexPath)) + return; + + QCLuceneIndexReader reader = QCLuceneIndexReader::open(indexPath); + reader.deleteDocuments(QCLuceneTerm(QLatin1String("namespace"), + namespaceName)); + + reader.close(); +} + +bool QHelpSearchIndexWriter::writeIndexMap(QHelpEngineCore& engine, + const QMap<QString, QDateTime>& indexMap) +{ + QByteArray bArray; + + QDataStream data(&bArray, QIODevice::ReadWrite); + data << indexMap; + + return engine.setCustomValue(QLatin1String("CluceneIndexedNamespaces"), + bArray); +} + +QList<QUrl> QHelpSearchIndexWriter::indexableFiles(QHelpEngineCore *helpEngine, + const QString &namespaceName, const QStringList &attributes) const +{ + QList<QUrl> docFiles = helpEngine->files(namespaceName, attributes, + QLatin1String("html")); + docFiles += helpEngine->files(namespaceName, attributes, QLatin1String("htm")); + docFiles += helpEngine->files(namespaceName, attributes, QLatin1String("txt")); + + return docFiles; +} + + + } // namespace clucene + } // namespace fulltextsearch +} // namespace qt + +QT_END_NAMESPACE |