summaryrefslogtreecommitdiffstats
path: root/tools/assistant
diff options
context:
space:
mode:
Diffstat (limited to 'tools/assistant')
-rw-r--r--tools/assistant/lib/qhelpsearchindexreader_clucene.cpp114
-rw-r--r--tools/assistant/lib/qhelpsearchindexreader_clucene_p.h15
-rw-r--r--tools/assistant/lib/qhelpsearchindexwriter_clucene.cpp408
3 files changed, 482 insertions, 55 deletions
diff --git a/tools/assistant/lib/qhelpsearchindexreader_clucene.cpp b/tools/assistant/lib/qhelpsearchindexreader_clucene.cpp
index 82a3a17..a1ba001 100644
--- a/tools/assistant/lib/qhelpsearchindexreader_clucene.cpp
+++ b/tools/assistant/lib/qhelpsearchindexreader_clucene.cpp
@@ -83,9 +83,8 @@ void QHelpSearchIndexReader::cancelSearching()
mutex.unlock();
}
-void QHelpSearchIndexReader::search(const QString &collectionFile,
- const QString &indexFilesFolder,
- const QList<QHelpSearchQuery> &queryList)
+void QHelpSearchIndexReader::search(const QString &collectionFile, const QString &indexFilesFolder,
+ const QList<QHelpSearchQuery> &queryList)
{
QMutexLocker lock(&mutex);
@@ -147,17 +146,16 @@ void QHelpSearchIndexReader::run()
try {
#endif
QCLuceneBooleanQuery booleanQuery;
- if (!buildQuery(booleanQuery, queryList)) {
+ QCLuceneStandardAnalyzer analyzer;
+ if (!buildQuery(booleanQuery, queryList, analyzer)) {
emit searchingFinished(0);
return;
}
const QStringList attribList = engine.filterAttributes(engine.currentFilter());
if (!attribList.isEmpty()) {
- QCLuceneStandardAnalyzer analyzer;
QCLuceneQuery* query = QCLuceneQueryParser::parse(QLatin1String("+")
- + attribList.join(QLatin1String(" +")), QLatin1String("attribute"),
- analyzer);
+ + attribList.join(QLatin1String(" +")), QLatin1String("attribute"), analyzer);
if (!query) {
emit searchingFinished(0);
@@ -168,10 +166,26 @@ void QHelpSearchIndexReader::run()
QCLuceneIndexSearcher indexSearcher(indexPath);
QCLuceneHits hits = indexSearcher.search(booleanQuery);
- const QStringList namespaceList = engine.registeredDocumentations();
+
+ bool boost = true;
+ QCLuceneBooleanQuery tryHarderQuery;
+ if (hits.length() == 0) {
+ if (buildTryHarderQuery(tryHarderQuery, queryList, analyzer)) {
+ if (!attribList.isEmpty()) {
+ QCLuceneQuery* query = QCLuceneQueryParser::parse(QLatin1String("+")
+ + attribList.join(QLatin1String(" +")), QLatin1String("attribute"),
+ analyzer);
+ tryHarderQuery.add(query, true, true, false);
+ }
+ hits = indexSearcher.search(tryHarderQuery);
+ boost = (hits.length() == 0);
+ }
+ }
QSet<QString> pathSet;
QCLuceneDocument document;
+ const QStringList namespaceList = engine.registeredDocumentations();
+
for (qint32 i = 0; i < hits.length(); i++) {
document = hits.document(i);
const QString path = document.get(QLatin1String("path"));
@@ -192,8 +206,8 @@ void QHelpSearchIndexReader::run()
}
indexSearcher.close();
- int count = hitList.count();
- if (count > 0)
+ const int count = hitList.count();
+ if ((count > 0) && boost)
boostSearchHits(engine, hitList, queryList);
emit searchingFinished(hitList.count());
@@ -206,11 +220,9 @@ void QHelpSearchIndexReader::run()
}
}
-bool QHelpSearchIndexReader::defaultQuery(const QString &term,
- QCLuceneBooleanQuery &booleanQuery)
+bool QHelpSearchIndexReader::defaultQuery(const QString &term, QCLuceneBooleanQuery &booleanQuery,
+ QCLuceneStandardAnalyzer &analyzer)
{
- QCLuceneStandardAnalyzer analyzer;
-
const QLatin1String c("content");
const QLatin1String t("titleTokenized");
@@ -226,21 +238,23 @@ bool QHelpSearchIndexReader::defaultQuery(const QString &term,
}
bool QHelpSearchIndexReader::buildQuery(QCLuceneBooleanQuery &booleanQuery,
- const QList<QHelpSearchQuery> &queryList)
+ const QList<QHelpSearchQuery> &queryList, QCLuceneStandardAnalyzer &analyzer)
{
foreach (const QHelpSearchQuery query, queryList) {
switch (query.fieldName) {
case QHelpSearchQuery::FUZZY: {
const QLatin1String fuzzy("~");
- foreach (const QString term, query.wordList) {
- if (term.isEmpty() || !defaultQuery(term.toLower() + fuzzy, booleanQuery))
+ foreach (const QString &term, query.wordList) {
+ if (term.isEmpty()
+ || !defaultQuery(term.toLower() + fuzzy, booleanQuery, analyzer)) {
return false;
+ }
}
} break;
case QHelpSearchQuery::WITHOUT: {
QStringList stopWords = QCLuceneStopAnalyzer().englishStopWords();
- foreach (const QString term, query.wordList) {
+ foreach (const QString &term, query.wordList) {
if (stopWords.contains(term, Qt::CaseInsensitive))
continue;
@@ -259,14 +273,14 @@ bool QHelpSearchIndexReader::buildQuery(QCLuceneBooleanQuery &booleanQuery,
} break;
case QHelpSearchQuery::PHRASE: {
- const QString term = query.wordList.at(0).toLower();
+ const QString &term = query.wordList.at(0).toLower();
if (term.contains(QLatin1Char(' '))) {
QStringList termList = term.split(QLatin1String(" "));
QCLucenePhraseQuery *q = new QCLucenePhraseQuery();
QStringList stopWords = QCLuceneStopAnalyzer().englishStopWords();
- foreach (const QString t, termList) {
- if (!stopWords.contains(t, Qt::CaseInsensitive))
- q->addTerm(QCLuceneTerm(QLatin1String("content"), t.toLower()));
+ foreach (const QString &term, termList) {
+ if (!stopWords.contains(term, Qt::CaseInsensitive))
+ q->addTerm(QCLuceneTerm(QLatin1String("content"), term.toLower()));
}
booleanQuery.add(q, true, true, false);
} else {
@@ -286,7 +300,7 @@ bool QHelpSearchIndexReader::buildQuery(QCLuceneBooleanQuery &booleanQuery,
case QHelpSearchQuery::ALL: {
QStringList stopWords = QCLuceneStopAnalyzer().englishStopWords();
- foreach (const QString term, query.wordList) {
+ foreach (const QString &term, query.wordList) {
if (stopWords.contains(term, Qt::CaseInsensitive))
continue;
@@ -302,9 +316,8 @@ bool QHelpSearchIndexReader::buildQuery(QCLuceneBooleanQuery &booleanQuery,
} break;
case QHelpSearchQuery::DEFAULT: {
- QCLuceneStandardAnalyzer analyzer;
- foreach (const QString t, query.wordList) {
- QCLuceneQuery *query = QCLuceneQueryParser::parse(t.toLower(),
+ foreach (const QString &term, query.wordList) {
+ QCLuceneQuery *query = QCLuceneQueryParser::parse(term.toLower(),
QLatin1String("content"), analyzer);
if (query)
@@ -313,8 +326,8 @@ bool QHelpSearchIndexReader::buildQuery(QCLuceneBooleanQuery &booleanQuery,
} break;
case QHelpSearchQuery::ATLEAST: {
- foreach (const QString term, query.wordList) {
- if (term.isEmpty() || !defaultQuery(term.toLower(), booleanQuery))
+ foreach (const QString &term, query.wordList) {
+ if (term.isEmpty() || !defaultQuery(term.toLower(), booleanQuery, analyzer))
return false;
}
}
@@ -324,16 +337,38 @@ bool QHelpSearchIndexReader::buildQuery(QCLuceneBooleanQuery &booleanQuery,
return true;
}
+bool QHelpSearchIndexReader::buildTryHarderQuery(QCLuceneBooleanQuery &booleanQuery,
+ const QList<QHelpSearchQuery> &queryList, QCLuceneStandardAnalyzer &analyzer)
+{
+ bool retVal = false;
+ foreach (const QHelpSearchQuery query, queryList) {
+ switch (query.fieldName) {
+ default: break;
+ case QHelpSearchQuery::DEFAULT: {
+ foreach (const QString &term, query.wordList) {
+ QCLuceneQuery *query = QCLuceneQueryParser::parse(term.toLower(),
+ QLatin1String("content"), analyzer);
+
+ if (query) {
+ retVal = true;
+ booleanQuery.add(query, true, false, false);
+ }
+ }
+ } break;
+ }
+ }
+ return retVal;
+}
+
void QHelpSearchIndexReader::boostSearchHits(const QHelpEngineCore &engine,
- QList<QHelpSearchEngine::SearchHit> &hitList,
- const QList<QHelpSearchQuery> &queryList)
+ QList<QHelpSearchEngine::SearchHit> &hitList, const QList<QHelpSearchQuery> &queryList)
{
foreach (const QHelpSearchQuery query, queryList) {
if (query.fieldName != QHelpSearchQuery::DEFAULT)
continue;
QString joinedQuery = query.wordList.join(QLatin1String(" "));
-
+
QCLuceneStandardAnalyzer analyzer;
QCLuceneQuery *parsedQuery = QCLuceneQueryParser::parse(
joinedQuery, QLatin1String("content"), analyzer);
@@ -351,8 +386,7 @@ void QHelpSearchIndexReader::boostSearchHits(const QHelpEngineCore &engine,
QStringList searchTerms;
while (index != -1) {
nextIndex = joinedQuery.indexOf(QLatin1String("content:"), index + 1);
- term = joinedQuery.mid(index + length, nextIndex - (length + index))
- .simplified();
+ term = joinedQuery.mid(index + length, nextIndex - (length + index)).simplified();
if (term.startsWith(QLatin1String("\""))
&& term.endsWith(QLatin1String("\""))) {
searchTerms.append(term.remove(QLatin1String("\"")));
@@ -370,17 +404,19 @@ void QHelpSearchIndexReader::boostSearchHits(const QHelpEngineCore &engine,
QString data = QString::fromUtf8(engine.fileData(hit.first));
int counter = 0;
- foreach (const QString& term, searchTerms)
+ foreach (const QString &term, searchTerms)
counter += data.count(term, Qt::CaseInsensitive);
hitMap.insertMulti(counter, hit);
}
QList<QHelpSearchEngine::SearchHit> boostedList;
- QMap<int, QHelpSearchEngine::SearchHit>::const_iterator i;
- for (i = hitMap.constEnd(), --i; i != hitMap.constBegin(); --i)
- boostedList.append(i.value());
- boostedList += hitList.mid(count - 1, hitList.count());
-
+ QMap<int, QHelpSearchEngine::SearchHit>::const_iterator it = hitMap.constEnd();
+ do {
+ --it;
+ boostedList.append(it.value());
+ } while (it != hitMap.constBegin());
+ boostedList += hitList.mid(count, hitList.count());
+
hitList = boostedList;
}
}
diff --git a/tools/assistant/lib/qhelpsearchindexreader_clucene_p.h b/tools/assistant/lib/qhelpsearchindexreader_clucene_p.h
index 892c4e6..f7536e9 100644
--- a/tools/assistant/lib/qhelpsearchindexreader_clucene_p.h
+++ b/tools/assistant/lib/qhelpsearchindexreader_clucene_p.h
@@ -54,6 +54,8 @@
//
#include "qhelpsearchengine.h"
+
+#include "fulltextsearch/qanalyzer_p.h"
#include "fulltextsearch/qquery_p.h"
#include <QtCore/QList>
@@ -93,12 +95,13 @@ signals:
private:
void run();
- bool defaultQuery(const QString &term,
- QCLuceneBooleanQuery &booleanQuery);
- bool buildQuery(QCLuceneBooleanQuery &booleanQuery,
- const QList<QHelpSearchQuery> &queryList);
- void boostSearchHits(const QHelpEngineCore &engine,
- QList<QHelpSearchEngine::SearchHit> &hitList,
+ bool defaultQuery(const QString &term, QCLuceneBooleanQuery &booleanQuery,
+ QCLuceneStandardAnalyzer &analyzer);
+ bool buildQuery(QCLuceneBooleanQuery &booleanQuery, const QList<QHelpSearchQuery> &queryList,
+ QCLuceneStandardAnalyzer &analyzer);
+ bool buildTryHarderQuery(QCLuceneBooleanQuery &booleanQuery,
+ const QList<QHelpSearchQuery> &queryList, QCLuceneStandardAnalyzer &analyzer);
+ void boostSearchHits(const QHelpEngineCore &engine, QList<QHelpSearchEngine::SearchHit> &hitList,
const QList<QHelpSearchQuery> &queryList);
private:
diff --git a/tools/assistant/lib/qhelpsearchindexwriter_clucene.cpp b/tools/assistant/lib/qhelpsearchindexwriter_clucene.cpp
index e53bbae..195c490 100644
--- a/tools/assistant/lib/qhelpsearchindexwriter_clucene.cpp
+++ b/tools/assistant/lib/qhelpsearchindexwriter_clucene.cpp
@@ -59,16 +59,340 @@
#include <QtNetwork/QLocalSocket>
#include <QtNetwork/QLocalServer>
+#include "private/qfunctions_p.h"
+
QT_BEGIN_NAMESPACE
namespace qt {
namespace fulltextsearch {
namespace clucene {
+// taken from qtexthtmlparser
+static const struct QTextHtmlEntity
+{
+ const char *name;
+ quint16 code;
+} entities[] = {
+ { "AElig", 0x00c6 },
+ { "AMP", 38 },
+ { "Aacute", 0x00c1 },
+ { "Acirc", 0x00c2 },
+ { "Agrave", 0x00c0 },
+ { "Alpha", 0x0391 },
+ { "Aring", 0x00c5 },
+ { "Atilde", 0x00c3 },
+ { "Auml", 0x00c4 },
+ { "Beta", 0x0392 },
+ { "Ccedil", 0x00c7 },
+ { "Chi", 0x03a7 },
+ { "Dagger", 0x2021 },
+ { "Delta", 0x0394 },
+ { "ETH", 0x00d0 },
+ { "Eacute", 0x00c9 },
+ { "Ecirc", 0x00ca },
+ { "Egrave", 0x00c8 },
+ { "Epsilon", 0x0395 },
+ { "Eta", 0x0397 },
+ { "Euml", 0x00cb },
+ { "GT", 62 },
+ { "Gamma", 0x0393 },
+ { "Iacute", 0x00cd },
+ { "Icirc", 0x00ce },
+ { "Igrave", 0x00cc },
+ { "Iota", 0x0399 },
+ { "Iuml", 0x00cf },
+ { "Kappa", 0x039a },
+ { "LT", 60 },
+ { "Lambda", 0x039b },
+ { "Mu", 0x039c },
+ { "Ntilde", 0x00d1 },
+ { "Nu", 0x039d },
+ { "OElig", 0x0152 },
+ { "Oacute", 0x00d3 },
+ { "Ocirc", 0x00d4 },
+ { "Ograve", 0x00d2 },
+ { "Omega", 0x03a9 },
+ { "Omicron", 0x039f },
+ { "Oslash", 0x00d8 },
+ { "Otilde", 0x00d5 },
+ { "Ouml", 0x00d6 },
+ { "Phi", 0x03a6 },
+ { "Pi", 0x03a0 },
+ { "Prime", 0x2033 },
+ { "Psi", 0x03a8 },
+ { "QUOT", 34 },
+ { "Rho", 0x03a1 },
+ { "Scaron", 0x0160 },
+ { "Sigma", 0x03a3 },
+ { "THORN", 0x00de },
+ { "Tau", 0x03a4 },
+ { "Theta", 0x0398 },
+ { "Uacute", 0x00da },
+ { "Ucirc", 0x00db },
+ { "Ugrave", 0x00d9 },
+ { "Upsilon", 0x03a5 },
+ { "Uuml", 0x00dc },
+ { "Xi", 0x039e },
+ { "Yacute", 0x00dd },
+ { "Yuml", 0x0178 },
+ { "Zeta", 0x0396 },
+ { "aacute", 0x00e1 },
+ { "acirc", 0x00e2 },
+ { "acute", 0x00b4 },
+ { "aelig", 0x00e6 },
+ { "agrave", 0x00e0 },
+ { "alefsym", 0x2135 },
+ { "alpha", 0x03b1 },
+ { "amp", 38 },
+ { "and", 0x22a5 },
+ { "ang", 0x2220 },
+ { "apos", 0x0027 },
+ { "aring", 0x00e5 },
+ { "asymp", 0x2248 },
+ { "atilde", 0x00e3 },
+ { "auml", 0x00e4 },
+ { "bdquo", 0x201e },
+ { "beta", 0x03b2 },
+ { "brvbar", 0x00a6 },
+ { "bull", 0x2022 },
+ { "cap", 0x2229 },
+ { "ccedil", 0x00e7 },
+ { "cedil", 0x00b8 },
+ { "cent", 0x00a2 },
+ { "chi", 0x03c7 },
+ { "circ", 0x02c6 },
+ { "clubs", 0x2663 },
+ { "cong", 0x2245 },
+ { "copy", 0x00a9 },
+ { "crarr", 0x21b5 },
+ { "cup", 0x222a },
+ { "curren", 0x00a4 },
+ { "dArr", 0x21d3 },
+ { "dagger", 0x2020 },
+ { "darr", 0x2193 },
+ { "deg", 0x00b0 },
+ { "delta", 0x03b4 },
+ { "diams", 0x2666 },
+ { "divide", 0x00f7 },
+ { "eacute", 0x00e9 },
+ { "ecirc", 0x00ea },
+ { "egrave", 0x00e8 },
+ { "empty", 0x2205 },
+ { "emsp", 0x2003 },
+ { "ensp", 0x2002 },
+ { "epsilon", 0x03b5 },
+ { "equiv", 0x2261 },
+ { "eta", 0x03b7 },
+ { "eth", 0x00f0 },
+ { "euml", 0x00eb },
+ { "euro", 0x20ac },
+ { "exist", 0x2203 },
+ { "fnof", 0x0192 },
+ { "forall", 0x2200 },
+ { "frac12", 0x00bd },
+ { "frac14", 0x00bc },
+ { "frac34", 0x00be },
+ { "frasl", 0x2044 },
+ { "gamma", 0x03b3 },
+ { "ge", 0x2265 },
+ { "gt", 62 },
+ { "hArr", 0x21d4 },
+ { "harr", 0x2194 },
+ { "hearts", 0x2665 },
+ { "hellip", 0x2026 },
+ { "iacute", 0x00ed },
+ { "icirc", 0x00ee },
+ { "iexcl", 0x00a1 },
+ { "igrave", 0x00ec },
+ { "image", 0x2111 },
+ { "infin", 0x221e },
+ { "int", 0x222b },
+ { "iota", 0x03b9 },
+ { "iquest", 0x00bf },
+ { "isin", 0x2208 },
+ { "iuml", 0x00ef },
+ { "kappa", 0x03ba },
+ { "lArr", 0x21d0 },
+ { "lambda", 0x03bb },
+ { "lang", 0x2329 },
+ { "laquo", 0x00ab },
+ { "larr", 0x2190 },
+ { "lceil", 0x2308 },
+ { "ldquo", 0x201c },
+ { "le", 0x2264 },
+ { "lfloor", 0x230a },
+ { "lowast", 0x2217 },
+ { "loz", 0x25ca },
+ { "lrm", 0x200e },
+ { "lsaquo", 0x2039 },
+ { "lsquo", 0x2018 },
+ { "lt", 60 },
+ { "macr", 0x00af },
+ { "mdash", 0x2014 },
+ { "micro", 0x00b5 },
+ { "middot", 0x00b7 },
+ { "minus", 0x2212 },
+ { "mu", 0x03bc },
+ { "nabla", 0x2207 },
+ { "nbsp", 0x00a0 },
+ { "ndash", 0x2013 },
+ { "ne", 0x2260 },
+ { "ni", 0x220b },
+ { "not", 0x00ac },
+ { "notin", 0x2209 },
+ { "nsub", 0x2284 },
+ { "ntilde", 0x00f1 },
+ { "nu", 0x03bd },
+ { "oacute", 0x00f3 },
+ { "ocirc", 0x00f4 },
+ { "oelig", 0x0153 },
+ { "ograve", 0x00f2 },
+ { "oline", 0x203e },
+ { "omega", 0x03c9 },
+ { "omicron", 0x03bf },
+ { "oplus", 0x2295 },
+ { "or", 0x22a6 },
+ { "ordf", 0x00aa },
+ { "ordm", 0x00ba },
+ { "oslash", 0x00f8 },
+ { "otilde", 0x00f5 },
+ { "otimes", 0x2297 },
+ { "ouml", 0x00f6 },
+ { "para", 0x00b6 },
+ { "part", 0x2202 },
+ { "percnt", 0x0025 },
+ { "permil", 0x2030 },
+ { "perp", 0x22a5 },
+ { "phi", 0x03c6 },
+ { "pi", 0x03c0 },
+ { "piv", 0x03d6 },
+ { "plusmn", 0x00b1 },
+ { "pound", 0x00a3 },
+ { "prime", 0x2032 },
+ { "prod", 0x220f },
+ { "prop", 0x221d },
+ { "psi", 0x03c8 },
+ { "quot", 34 },
+ { "rArr", 0x21d2 },
+ { "radic", 0x221a },
+ { "rang", 0x232a },
+ { "raquo", 0x00bb },
+ { "rarr", 0x2192 },
+ { "rceil", 0x2309 },
+ { "rdquo", 0x201d },
+ { "real", 0x211c },
+ { "reg", 0x00ae },
+ { "rfloor", 0x230b },
+ { "rho", 0x03c1 },
+ { "rlm", 0x200f },
+ { "rsaquo", 0x203a },
+ { "rsquo", 0x2019 },
+ { "sbquo", 0x201a },
+ { "scaron", 0x0161 },
+ { "sdot", 0x22c5 },
+ { "sect", 0x00a7 },
+ { "shy", 0x00ad },
+ { "sigma", 0x03c3 },
+ { "sigmaf", 0x03c2 },
+ { "sim", 0x223c },
+ { "spades", 0x2660 },
+ { "sub", 0x2282 },
+ { "sube", 0x2286 },
+ { "sum", 0x2211 },
+ { "sup", 0x2283 },
+ { "sup1", 0x00b9 },
+ { "sup2", 0x00b2 },
+ { "sup3", 0x00b3 },
+ { "supe", 0x2287 },
+ { "szlig", 0x00df },
+ { "tau", 0x03c4 },
+ { "there4", 0x2234 },
+ { "theta", 0x03b8 },
+ { "thetasym", 0x03d1 },
+ { "thinsp", 0x2009 },
+ { "thorn", 0x00fe },
+ { "tilde", 0x02dc },
+ { "times", 0x00d7 },
+ { "trade", 0x2122 },
+ { "uArr", 0x21d1 },
+ { "uacute", 0x00fa },
+ { "uarr", 0x2191 },
+ { "ucirc", 0x00fb },
+ { "ugrave", 0x00f9 },
+ { "uml", 0x00a8 },
+ { "upsih", 0x03d2 },
+ { "upsilon", 0x03c5 },
+ { "uuml", 0x00fc },
+ { "weierp", 0x2118 },
+ { "xi", 0x03be },
+ { "yacute", 0x00fd },
+ { "yen", 0x00a5 },
+ { "yuml", 0x00ff },
+ { "zeta", 0x03b6 },
+ { "zwj", 0x200d },
+ { "zwnj", 0x200c }
+};
+
+Q_STATIC_GLOBAL_OPERATOR bool operator<(const QString &entityStr, const QTextHtmlEntity &entity)
+{
+ return entityStr < QLatin1String(entity.name);
+}
+
+Q_STATIC_GLOBAL_OPERATOR bool operator<(const QTextHtmlEntity &entity, const QString &entityStr)
+{
+ return QLatin1String(entity.name) < entityStr;
+}
+
+static QChar resolveEntity(const QString &entity)
+{
+ const QTextHtmlEntity *start = &entities[0];
+ const QTextHtmlEntity *end = &entities[(sizeof(entities) / sizeof(entities[0]))];
+ const QTextHtmlEntity *e = qBinaryFind(start, end, entity);
+ if (e == end)
+ return QChar();
+ return e->code;
+}
+
+static const uint latin1Extended[0xA0 - 0x80] = {
+ 0x20ac, // 0x80
+ 0x0081, // 0x81 direct mapping
+ 0x201a, // 0x82
+ 0x0192, // 0x83
+ 0x201e, // 0x84
+ 0x2026, // 0x85
+ 0x2020, // 0x86
+ 0x2021, // 0x87
+ 0x02C6, // 0x88
+ 0x2030, // 0x89
+ 0x0160, // 0x8A
+ 0x2039, // 0x8B
+ 0x0152, // 0x8C
+ 0x008D, // 0x8D direct mapping
+ 0x017D, // 0x8E
+ 0x008F, // 0x8F directmapping
+ 0x0090, // 0x90 directmapping
+ 0x2018, // 0x91
+ 0x2019, // 0x92
+ 0x201C, // 0x93
+ 0X201D, // 0x94
+ 0x2022, // 0x95
+ 0x2013, // 0x96
+ 0x2014, // 0x97
+ 0x02DC, // 0x98
+ 0x2122, // 0x99
+ 0x0161, // 0x9A
+ 0x203A, // 0x9B
+ 0x0153, // 0x9C
+ 0x009D, // 0x9D direct mapping
+ 0x017E, // 0x9E
+ 0x0178 // 0x9F
+};
+// end taken from qtexthtmlparser
+
class DocumentHelper
{
public:
- DocumentHelper(const QString& fileName, const QByteArray &data)
+ DocumentHelper(const QString &fileName, const QByteArray &data)
: fileName(fileName) , data(readData(data)) {}
~DocumentHelper() {}
@@ -131,8 +455,15 @@ private:
while (j < length) {
c = buf[j++];
if (c == QLatin1Char('<') || c == QLatin1Char('&')) {
- if (count > 1)
+ if (count > 1 && c != QLatin1Char('&'))
parsedContent.append(QLatin1Char(' '));
+ else if (c == QLatin1Char('&')) {
+ // Note: this will modify the counter j, in case we sucessful parsed the entity
+ // we will have modified the counter to stay 1 before the closing ';', so
+ // the following if condition will be met with if (c == QLatin1Char(';'))
+ parsedContent.append(parseEntity(length, buf, j));
+ }
+
count = 0;
valid = false;
continue;
@@ -157,6 +488,63 @@ private:
return parsedContent;
}
+ // taken from qtexthtmlparser
+ // parses an entity after "&", and returns it
+ QString parseEntity(int len, const QChar *buf, int &pos) const
+ {
+ int recover = pos;
+ QString entity;
+ while (pos < len) {
+ QChar c = buf[pos++];
+ if (c.isSpace() || pos - recover > 9) {
+ goto error;
+ }
+ if (c == QLatin1Char(';')) {
+ pos--;
+ break;
+ }
+ entity += c;
+ }
+ {
+ QChar resolved = resolveEntity(entity);
+ if (!resolved.isNull())
+ return QString(resolved);
+ }
+ if (entity.length() > 1 && entity.at(0) == QLatin1Char('#')) {
+ entity.remove(0, 1); // removing leading #
+
+ int base = 10;
+ bool ok = false;
+
+ if (entity.at(0).toLower() == QLatin1Char('x')) { // hex entity?
+ entity.remove(0, 1);
+ base = 16;
+ }
+
+ uint uc = entity.toUInt(&ok, base);
+ if (ok) {
+ if (uc >= 0x80 && uc < 0x80 + (sizeof(latin1Extended) / sizeof(latin1Extended[0])))
+ uc = latin1Extended[uc - 0x80]; // windows latin 1 extended
+ QString str;
+ if (uc > 0xffff) {
+ // surrogate pair
+ uc -= 0x10000;
+ ushort high = uc/0x400 + 0xd800;
+ ushort low = uc%0x400 + 0xdc00;
+ str.append(QChar(high));
+ str.append(QChar(low));
+ } else {
+ str.append(QChar(uc));
+ }
+ return str;
+ }
+ }
+ error:
+ pos = recover;
+ return QLatin1String(" ");
+ }
+ // end taken from qtexthtmlparser
+
private:
QString fileName;
QString data;
@@ -239,7 +627,7 @@ void QHelpSearchIndexWriter::run()
// old style qhc file < 4.4.2, need to convert...
const QStringList indexedNamespaces = engine.customValue(oldKey).
toString().split(QLatin1String("|"), QString::SkipEmptyParts);
- foreach (const QString& nameSpace, indexedNamespaces)
+ foreach (const QString &nameSpace, indexedNamespaces)
indexMap.insert(nameSpace, QDateTime());
engine.removeCustomValue(oldKey);
} else {
@@ -294,7 +682,7 @@ void QHelpSearchIndexWriter::run()
}
if (QCLuceneIndexReader::indexExists(indexPath) && !reindex) {
- foreach(const QString& namespaceName, registeredDocs) {
+ foreach(const QString &namespaceName, registeredDocs) {
mutexLocker.relock();
if (m_cancel) {
emit indexingFinished();
@@ -342,7 +730,7 @@ void QHelpSearchIndexWriter::run()
writer->setMaxFieldLength(QCLuceneIndexWriter::DEFAULT_MAX_FIELD_LENGTH);
QStringList namespaces;
- foreach(const QString& namespaceName, registeredDocs) {
+ foreach(const QString &namespaceName, registeredDocs) {
mutexLocker.relock();
if (m_cancel) {
writer->close();
@@ -367,7 +755,7 @@ void QHelpSearchIndexWriter::run()
break;
} else {
bool bail = false;
- foreach (const QStringList& attributes, attributeSets) {
+ foreach (const QStringList &attributes, attributeSets) {
const QList<QUrl> docFiles = indexableFiles(&engine,
namespaceName, attributes);
if (!addDocuments(docFiles, engine, attributes, namespaceName,
@@ -397,7 +785,7 @@ void QHelpSearchIndexWriter::run()
mutexLocker.unlock();
QStringList indexedNamespaces = indexMap.keys();
- foreach(const QString& namespaceName, indexedNamespaces) {
+ foreach(const QString &namespaceName, indexedNamespaces) {
mutexLocker.relock();
if (m_cancel)
break;
@@ -422,7 +810,7 @@ bool QHelpSearchIndexWriter::addDocuments(const QList<QUrl> docFiles,
const QString attrList = attributes.join(QLatin1String(" "));
locker.unlock();
- foreach(const QUrl& url, docFiles) {
+ foreach(const QUrl &url, docFiles) {
QCLuceneDocument document;
DocumentHelper helper(url.toString(), engine.fileData(url));
if (helper.addFieldsToDocument(&document, namespaceName, attrList))
@@ -450,8 +838,8 @@ void QHelpSearchIndexWriter::removeDocuments(const QString &indexPath,
reader.close();
}
-bool QHelpSearchIndexWriter::writeIndexMap(QHelpEngineCore& engine,
- const QMap<QString, QDateTime>& indexMap)
+bool QHelpSearchIndexWriter::writeIndexMap(QHelpEngineCore &engine,
+ const QMap<QString, QDateTime> &indexMap)
{
QByteArray bArray;