diff options
author | kh <qtc-committer@nokia.com> | 2009-03-31 09:49:24 (GMT) |
---|---|---|
committer | kh <qtc-committer@nokia.com> | 2009-03-31 09:49:24 (GMT) |
commit | b311ebf246a0af16f851dd16c92594c238a0048f (patch) | |
tree | ae73343feecec85b75554c15a176579ce35479de /tools/assistant | |
parent | 3d370dc92466b010bfe6cdd82411f7e3d13da7c1 (diff) | |
download | Qt-b311ebf246a0af16f851dd16c92594c238a0048f.zip Qt-b311ebf246a0af16f851dd16c92594c238a0048f.tar.gz Qt-b311ebf246a0af16f851dd16c92594c238a0048f.tar.bz2 |
fixes search for non latin1 characters
while parsing html files, we removed some entities the should have been
converted to there native representation, thus we were not able to find
them
Task-number: 247732
Reviewed-by: hjk
Diffstat (limited to 'tools/assistant')
3 files changed, 482 insertions, 55 deletions
diff --git a/tools/assistant/lib/qhelpsearchindexreader_clucene.cpp b/tools/assistant/lib/qhelpsearchindexreader_clucene.cpp index 82a3a17..a1ba001 100644 --- a/tools/assistant/lib/qhelpsearchindexreader_clucene.cpp +++ b/tools/assistant/lib/qhelpsearchindexreader_clucene.cpp @@ -83,9 +83,8 @@ void QHelpSearchIndexReader::cancelSearching() mutex.unlock(); } -void QHelpSearchIndexReader::search(const QString &collectionFile, - const QString &indexFilesFolder, - const QList<QHelpSearchQuery> &queryList) +void QHelpSearchIndexReader::search(const QString &collectionFile, const QString &indexFilesFolder, + const QList<QHelpSearchQuery> &queryList) { QMutexLocker lock(&mutex); @@ -147,17 +146,16 @@ void QHelpSearchIndexReader::run() try { #endif QCLuceneBooleanQuery booleanQuery; - if (!buildQuery(booleanQuery, queryList)) { + QCLuceneStandardAnalyzer analyzer; + if (!buildQuery(booleanQuery, queryList, analyzer)) { emit searchingFinished(0); return; } const QStringList attribList = engine.filterAttributes(engine.currentFilter()); if (!attribList.isEmpty()) { - QCLuceneStandardAnalyzer analyzer; QCLuceneQuery* query = QCLuceneQueryParser::parse(QLatin1String("+") - + attribList.join(QLatin1String(" +")), QLatin1String("attribute"), - analyzer); + + attribList.join(QLatin1String(" +")), QLatin1String("attribute"), analyzer); if (!query) { emit searchingFinished(0); @@ -168,10 +166,26 @@ void QHelpSearchIndexReader::run() QCLuceneIndexSearcher indexSearcher(indexPath); QCLuceneHits hits = indexSearcher.search(booleanQuery); - const QStringList namespaceList = engine.registeredDocumentations(); + + bool boost = true; + QCLuceneBooleanQuery tryHarderQuery; + if (hits.length() == 0) { + if (buildTryHarderQuery(tryHarderQuery, queryList, analyzer)) { + if (!attribList.isEmpty()) { + QCLuceneQuery* query = QCLuceneQueryParser::parse(QLatin1String("+") + + attribList.join(QLatin1String(" +")), QLatin1String("attribute"), + analyzer); + tryHarderQuery.add(query, true, true, false); + } + hits = indexSearcher.search(tryHarderQuery); + boost = (hits.length() == 0); + } + } QSet<QString> pathSet; QCLuceneDocument document; + const QStringList namespaceList = engine.registeredDocumentations(); + for (qint32 i = 0; i < hits.length(); i++) { document = hits.document(i); const QString path = document.get(QLatin1String("path")); @@ -192,8 +206,8 @@ void QHelpSearchIndexReader::run() } indexSearcher.close(); - int count = hitList.count(); - if (count > 0) + const int count = hitList.count(); + if ((count > 0) && boost) boostSearchHits(engine, hitList, queryList); emit searchingFinished(hitList.count()); @@ -206,11 +220,9 @@ void QHelpSearchIndexReader::run() } } -bool QHelpSearchIndexReader::defaultQuery(const QString &term, - QCLuceneBooleanQuery &booleanQuery) +bool QHelpSearchIndexReader::defaultQuery(const QString &term, QCLuceneBooleanQuery &booleanQuery, + QCLuceneStandardAnalyzer &analyzer) { - QCLuceneStandardAnalyzer analyzer; - const QLatin1String c("content"); const QLatin1String t("titleTokenized"); @@ -226,21 +238,23 @@ bool QHelpSearchIndexReader::defaultQuery(const QString &term, } bool QHelpSearchIndexReader::buildQuery(QCLuceneBooleanQuery &booleanQuery, - const QList<QHelpSearchQuery> &queryList) + const QList<QHelpSearchQuery> &queryList, QCLuceneStandardAnalyzer &analyzer) { foreach (const QHelpSearchQuery query, queryList) { switch (query.fieldName) { case QHelpSearchQuery::FUZZY: { const QLatin1String fuzzy("~"); - foreach (const QString term, query.wordList) { - if (term.isEmpty() || !defaultQuery(term.toLower() + fuzzy, booleanQuery)) + foreach (const QString &term, query.wordList) { + if (term.isEmpty() + || !defaultQuery(term.toLower() + fuzzy, booleanQuery, analyzer)) { return false; + } } } break; case QHelpSearchQuery::WITHOUT: { QStringList stopWords = QCLuceneStopAnalyzer().englishStopWords(); - foreach (const QString term, query.wordList) { + foreach (const QString &term, query.wordList) { if (stopWords.contains(term, Qt::CaseInsensitive)) continue; @@ -259,14 +273,14 @@ bool QHelpSearchIndexReader::buildQuery(QCLuceneBooleanQuery &booleanQuery, } break; case QHelpSearchQuery::PHRASE: { - const QString term = query.wordList.at(0).toLower(); + const QString &term = query.wordList.at(0).toLower(); if (term.contains(QLatin1Char(' '))) { QStringList termList = term.split(QLatin1String(" ")); QCLucenePhraseQuery *q = new QCLucenePhraseQuery(); QStringList stopWords = QCLuceneStopAnalyzer().englishStopWords(); - foreach (const QString t, termList) { - if (!stopWords.contains(t, Qt::CaseInsensitive)) - q->addTerm(QCLuceneTerm(QLatin1String("content"), t.toLower())); + foreach (const QString &term, termList) { + if (!stopWords.contains(term, Qt::CaseInsensitive)) + q->addTerm(QCLuceneTerm(QLatin1String("content"), term.toLower())); } booleanQuery.add(q, true, true, false); } else { @@ -286,7 +300,7 @@ bool QHelpSearchIndexReader::buildQuery(QCLuceneBooleanQuery &booleanQuery, case QHelpSearchQuery::ALL: { QStringList stopWords = QCLuceneStopAnalyzer().englishStopWords(); - foreach (const QString term, query.wordList) { + foreach (const QString &term, query.wordList) { if (stopWords.contains(term, Qt::CaseInsensitive)) continue; @@ -302,9 +316,8 @@ bool QHelpSearchIndexReader::buildQuery(QCLuceneBooleanQuery &booleanQuery, } break; case QHelpSearchQuery::DEFAULT: { - QCLuceneStandardAnalyzer analyzer; - foreach (const QString t, query.wordList) { - QCLuceneQuery *query = QCLuceneQueryParser::parse(t.toLower(), + foreach (const QString &term, query.wordList) { + QCLuceneQuery *query = QCLuceneQueryParser::parse(term.toLower(), QLatin1String("content"), analyzer); if (query) @@ -313,8 +326,8 @@ bool QHelpSearchIndexReader::buildQuery(QCLuceneBooleanQuery &booleanQuery, } break; case QHelpSearchQuery::ATLEAST: { - foreach (const QString term, query.wordList) { - if (term.isEmpty() || !defaultQuery(term.toLower(), booleanQuery)) + foreach (const QString &term, query.wordList) { + if (term.isEmpty() || !defaultQuery(term.toLower(), booleanQuery, analyzer)) return false; } } @@ -324,16 +337,38 @@ bool QHelpSearchIndexReader::buildQuery(QCLuceneBooleanQuery &booleanQuery, return true; } +bool QHelpSearchIndexReader::buildTryHarderQuery(QCLuceneBooleanQuery &booleanQuery, + const QList<QHelpSearchQuery> &queryList, QCLuceneStandardAnalyzer &analyzer) +{ + bool retVal = false; + foreach (const QHelpSearchQuery query, queryList) { + switch (query.fieldName) { + default: break; + case QHelpSearchQuery::DEFAULT: { + foreach (const QString &term, query.wordList) { + QCLuceneQuery *query = QCLuceneQueryParser::parse(term.toLower(), + QLatin1String("content"), analyzer); + + if (query) { + retVal = true; + booleanQuery.add(query, true, false, false); + } + } + } break; + } + } + return retVal; +} + void QHelpSearchIndexReader::boostSearchHits(const QHelpEngineCore &engine, - QList<QHelpSearchEngine::SearchHit> &hitList, - const QList<QHelpSearchQuery> &queryList) + QList<QHelpSearchEngine::SearchHit> &hitList, const QList<QHelpSearchQuery> &queryList) { foreach (const QHelpSearchQuery query, queryList) { if (query.fieldName != QHelpSearchQuery::DEFAULT) continue; QString joinedQuery = query.wordList.join(QLatin1String(" ")); - + QCLuceneStandardAnalyzer analyzer; QCLuceneQuery *parsedQuery = QCLuceneQueryParser::parse( joinedQuery, QLatin1String("content"), analyzer); @@ -351,8 +386,7 @@ void QHelpSearchIndexReader::boostSearchHits(const QHelpEngineCore &engine, QStringList searchTerms; while (index != -1) { nextIndex = joinedQuery.indexOf(QLatin1String("content:"), index + 1); - term = joinedQuery.mid(index + length, nextIndex - (length + index)) - .simplified(); + term = joinedQuery.mid(index + length, nextIndex - (length + index)).simplified(); if (term.startsWith(QLatin1String("\"")) && term.endsWith(QLatin1String("\""))) { searchTerms.append(term.remove(QLatin1String("\""))); @@ -370,17 +404,19 @@ void QHelpSearchIndexReader::boostSearchHits(const QHelpEngineCore &engine, QString data = QString::fromUtf8(engine.fileData(hit.first)); int counter = 0; - foreach (const QString& term, searchTerms) + foreach (const QString &term, searchTerms) counter += data.count(term, Qt::CaseInsensitive); hitMap.insertMulti(counter, hit); } QList<QHelpSearchEngine::SearchHit> boostedList; - QMap<int, QHelpSearchEngine::SearchHit>::const_iterator i; - for (i = hitMap.constEnd(), --i; i != hitMap.constBegin(); --i) - boostedList.append(i.value()); - boostedList += hitList.mid(count - 1, hitList.count()); - + QMap<int, QHelpSearchEngine::SearchHit>::const_iterator it = hitMap.constEnd(); + do { + --it; + boostedList.append(it.value()); + } while (it != hitMap.constBegin()); + boostedList += hitList.mid(count, hitList.count()); + hitList = boostedList; } } diff --git a/tools/assistant/lib/qhelpsearchindexreader_clucene_p.h b/tools/assistant/lib/qhelpsearchindexreader_clucene_p.h index 892c4e6..f7536e9 100644 --- a/tools/assistant/lib/qhelpsearchindexreader_clucene_p.h +++ b/tools/assistant/lib/qhelpsearchindexreader_clucene_p.h @@ -54,6 +54,8 @@ // #include "qhelpsearchengine.h" + +#include "fulltextsearch/qanalyzer_p.h" #include "fulltextsearch/qquery_p.h" #include <QtCore/QList> @@ -93,12 +95,13 @@ signals: private: void run(); - bool defaultQuery(const QString &term, - QCLuceneBooleanQuery &booleanQuery); - bool buildQuery(QCLuceneBooleanQuery &booleanQuery, - const QList<QHelpSearchQuery> &queryList); - void boostSearchHits(const QHelpEngineCore &engine, - QList<QHelpSearchEngine::SearchHit> &hitList, + bool defaultQuery(const QString &term, QCLuceneBooleanQuery &booleanQuery, + QCLuceneStandardAnalyzer &analyzer); + bool buildQuery(QCLuceneBooleanQuery &booleanQuery, const QList<QHelpSearchQuery> &queryList, + QCLuceneStandardAnalyzer &analyzer); + bool buildTryHarderQuery(QCLuceneBooleanQuery &booleanQuery, + const QList<QHelpSearchQuery> &queryList, QCLuceneStandardAnalyzer &analyzer); + void boostSearchHits(const QHelpEngineCore &engine, QList<QHelpSearchEngine::SearchHit> &hitList, const QList<QHelpSearchQuery> &queryList); private: diff --git a/tools/assistant/lib/qhelpsearchindexwriter_clucene.cpp b/tools/assistant/lib/qhelpsearchindexwriter_clucene.cpp index e53bbae..195c490 100644 --- a/tools/assistant/lib/qhelpsearchindexwriter_clucene.cpp +++ b/tools/assistant/lib/qhelpsearchindexwriter_clucene.cpp @@ -59,16 +59,340 @@ #include <QtNetwork/QLocalSocket> #include <QtNetwork/QLocalServer> +#include "private/qfunctions_p.h" + QT_BEGIN_NAMESPACE namespace qt { namespace fulltextsearch { namespace clucene { +// taken from qtexthtmlparser +static const struct QTextHtmlEntity +{ + const char *name; + quint16 code; +} entities[] = { + { "AElig", 0x00c6 }, + { "AMP", 38 }, + { "Aacute", 0x00c1 }, + { "Acirc", 0x00c2 }, + { "Agrave", 0x00c0 }, + { "Alpha", 0x0391 }, + { "Aring", 0x00c5 }, + { "Atilde", 0x00c3 }, + { "Auml", 0x00c4 }, + { "Beta", 0x0392 }, + { "Ccedil", 0x00c7 }, + { "Chi", 0x03a7 }, + { "Dagger", 0x2021 }, + { "Delta", 0x0394 }, + { "ETH", 0x00d0 }, + { "Eacute", 0x00c9 }, + { "Ecirc", 0x00ca }, + { "Egrave", 0x00c8 }, + { "Epsilon", 0x0395 }, + { "Eta", 0x0397 }, + { "Euml", 0x00cb }, + { "GT", 62 }, + { "Gamma", 0x0393 }, + { "Iacute", 0x00cd }, + { "Icirc", 0x00ce }, + { "Igrave", 0x00cc }, + { "Iota", 0x0399 }, + { "Iuml", 0x00cf }, + { "Kappa", 0x039a }, + { "LT", 60 }, + { "Lambda", 0x039b }, + { "Mu", 0x039c }, + { "Ntilde", 0x00d1 }, + { "Nu", 0x039d }, + { "OElig", 0x0152 }, + { "Oacute", 0x00d3 }, + { "Ocirc", 0x00d4 }, + { "Ograve", 0x00d2 }, + { "Omega", 0x03a9 }, + { "Omicron", 0x039f }, + { "Oslash", 0x00d8 }, + { "Otilde", 0x00d5 }, + { "Ouml", 0x00d6 }, + { "Phi", 0x03a6 }, + { "Pi", 0x03a0 }, + { "Prime", 0x2033 }, + { "Psi", 0x03a8 }, + { "QUOT", 34 }, + { "Rho", 0x03a1 }, + { "Scaron", 0x0160 }, + { "Sigma", 0x03a3 }, + { "THORN", 0x00de }, + { "Tau", 0x03a4 }, + { "Theta", 0x0398 }, + { "Uacute", 0x00da }, + { "Ucirc", 0x00db }, + { "Ugrave", 0x00d9 }, + { "Upsilon", 0x03a5 }, + { "Uuml", 0x00dc }, + { "Xi", 0x039e }, + { "Yacute", 0x00dd }, + { "Yuml", 0x0178 }, + { "Zeta", 0x0396 }, + { "aacute", 0x00e1 }, + { "acirc", 0x00e2 }, + { "acute", 0x00b4 }, + { "aelig", 0x00e6 }, + { "agrave", 0x00e0 }, + { "alefsym", 0x2135 }, + { "alpha", 0x03b1 }, + { "amp", 38 }, + { "and", 0x22a5 }, + { "ang", 0x2220 }, + { "apos", 0x0027 }, + { "aring", 0x00e5 }, + { "asymp", 0x2248 }, + { "atilde", 0x00e3 }, + { "auml", 0x00e4 }, + { "bdquo", 0x201e }, + { "beta", 0x03b2 }, + { "brvbar", 0x00a6 }, + { "bull", 0x2022 }, + { "cap", 0x2229 }, + { "ccedil", 0x00e7 }, + { "cedil", 0x00b8 }, + { "cent", 0x00a2 }, + { "chi", 0x03c7 }, + { "circ", 0x02c6 }, + { "clubs", 0x2663 }, + { "cong", 0x2245 }, + { "copy", 0x00a9 }, + { "crarr", 0x21b5 }, + { "cup", 0x222a }, + { "curren", 0x00a4 }, + { "dArr", 0x21d3 }, + { "dagger", 0x2020 }, + { "darr", 0x2193 }, + { "deg", 0x00b0 }, + { "delta", 0x03b4 }, + { "diams", 0x2666 }, + { "divide", 0x00f7 }, + { "eacute", 0x00e9 }, + { "ecirc", 0x00ea }, + { "egrave", 0x00e8 }, + { "empty", 0x2205 }, + { "emsp", 0x2003 }, + { "ensp", 0x2002 }, + { "epsilon", 0x03b5 }, + { "equiv", 0x2261 }, + { "eta", 0x03b7 }, + { "eth", 0x00f0 }, + { "euml", 0x00eb }, + { "euro", 0x20ac }, + { "exist", 0x2203 }, + { "fnof", 0x0192 }, + { "forall", 0x2200 }, + { "frac12", 0x00bd }, + { "frac14", 0x00bc }, + { "frac34", 0x00be }, + { "frasl", 0x2044 }, + { "gamma", 0x03b3 }, + { "ge", 0x2265 }, + { "gt", 62 }, + { "hArr", 0x21d4 }, + { "harr", 0x2194 }, + { "hearts", 0x2665 }, + { "hellip", 0x2026 }, + { "iacute", 0x00ed }, + { "icirc", 0x00ee }, + { "iexcl", 0x00a1 }, + { "igrave", 0x00ec }, + { "image", 0x2111 }, + { "infin", 0x221e }, + { "int", 0x222b }, + { "iota", 0x03b9 }, + { "iquest", 0x00bf }, + { "isin", 0x2208 }, + { "iuml", 0x00ef }, + { "kappa", 0x03ba }, + { "lArr", 0x21d0 }, + { "lambda", 0x03bb }, + { "lang", 0x2329 }, + { "laquo", 0x00ab }, + { "larr", 0x2190 }, + { "lceil", 0x2308 }, + { "ldquo", 0x201c }, + { "le", 0x2264 }, + { "lfloor", 0x230a }, + { "lowast", 0x2217 }, + { "loz", 0x25ca }, + { "lrm", 0x200e }, + { "lsaquo", 0x2039 }, + { "lsquo", 0x2018 }, + { "lt", 60 }, + { "macr", 0x00af }, + { "mdash", 0x2014 }, + { "micro", 0x00b5 }, + { "middot", 0x00b7 }, + { "minus", 0x2212 }, + { "mu", 0x03bc }, + { "nabla", 0x2207 }, + { "nbsp", 0x00a0 }, + { "ndash", 0x2013 }, + { "ne", 0x2260 }, + { "ni", 0x220b }, + { "not", 0x00ac }, + { "notin", 0x2209 }, + { "nsub", 0x2284 }, + { "ntilde", 0x00f1 }, + { "nu", 0x03bd }, + { "oacute", 0x00f3 }, + { "ocirc", 0x00f4 }, + { "oelig", 0x0153 }, + { "ograve", 0x00f2 }, + { "oline", 0x203e }, + { "omega", 0x03c9 }, + { "omicron", 0x03bf }, + { "oplus", 0x2295 }, + { "or", 0x22a6 }, + { "ordf", 0x00aa }, + { "ordm", 0x00ba }, + { "oslash", 0x00f8 }, + { "otilde", 0x00f5 }, + { "otimes", 0x2297 }, + { "ouml", 0x00f6 }, + { "para", 0x00b6 }, + { "part", 0x2202 }, + { "percnt", 0x0025 }, + { "permil", 0x2030 }, + { "perp", 0x22a5 }, + { "phi", 0x03c6 }, + { "pi", 0x03c0 }, + { "piv", 0x03d6 }, + { "plusmn", 0x00b1 }, + { "pound", 0x00a3 }, + { "prime", 0x2032 }, + { "prod", 0x220f }, + { "prop", 0x221d }, + { "psi", 0x03c8 }, + { "quot", 34 }, + { "rArr", 0x21d2 }, + { "radic", 0x221a }, + { "rang", 0x232a }, + { "raquo", 0x00bb }, + { "rarr", 0x2192 }, + { "rceil", 0x2309 }, + { "rdquo", 0x201d }, + { "real", 0x211c }, + { "reg", 0x00ae }, + { "rfloor", 0x230b }, + { "rho", 0x03c1 }, + { "rlm", 0x200f }, + { "rsaquo", 0x203a }, + { "rsquo", 0x2019 }, + { "sbquo", 0x201a }, + { "scaron", 0x0161 }, + { "sdot", 0x22c5 }, + { "sect", 0x00a7 }, + { "shy", 0x00ad }, + { "sigma", 0x03c3 }, + { "sigmaf", 0x03c2 }, + { "sim", 0x223c }, + { "spades", 0x2660 }, + { "sub", 0x2282 }, + { "sube", 0x2286 }, + { "sum", 0x2211 }, + { "sup", 0x2283 }, + { "sup1", 0x00b9 }, + { "sup2", 0x00b2 }, + { "sup3", 0x00b3 }, + { "supe", 0x2287 }, + { "szlig", 0x00df }, + { "tau", 0x03c4 }, + { "there4", 0x2234 }, + { "theta", 0x03b8 }, + { "thetasym", 0x03d1 }, + { "thinsp", 0x2009 }, + { "thorn", 0x00fe }, + { "tilde", 0x02dc }, + { "times", 0x00d7 }, + { "trade", 0x2122 }, + { "uArr", 0x21d1 }, + { "uacute", 0x00fa }, + { "uarr", 0x2191 }, + { "ucirc", 0x00fb }, + { "ugrave", 0x00f9 }, + { "uml", 0x00a8 }, + { "upsih", 0x03d2 }, + { "upsilon", 0x03c5 }, + { "uuml", 0x00fc }, + { "weierp", 0x2118 }, + { "xi", 0x03be }, + { "yacute", 0x00fd }, + { "yen", 0x00a5 }, + { "yuml", 0x00ff }, + { "zeta", 0x03b6 }, + { "zwj", 0x200d }, + { "zwnj", 0x200c } +}; + +Q_STATIC_GLOBAL_OPERATOR bool operator<(const QString &entityStr, const QTextHtmlEntity &entity) +{ + return entityStr < QLatin1String(entity.name); +} + +Q_STATIC_GLOBAL_OPERATOR bool operator<(const QTextHtmlEntity &entity, const QString &entityStr) +{ + return QLatin1String(entity.name) < entityStr; +} + +static QChar resolveEntity(const QString &entity) +{ + const QTextHtmlEntity *start = &entities[0]; + const QTextHtmlEntity *end = &entities[(sizeof(entities) / sizeof(entities[0]))]; + const QTextHtmlEntity *e = qBinaryFind(start, end, entity); + if (e == end) + return QChar(); + return e->code; +} + +static const uint latin1Extended[0xA0 - 0x80] = { + 0x20ac, // 0x80 + 0x0081, // 0x81 direct mapping + 0x201a, // 0x82 + 0x0192, // 0x83 + 0x201e, // 0x84 + 0x2026, // 0x85 + 0x2020, // 0x86 + 0x2021, // 0x87 + 0x02C6, // 0x88 + 0x2030, // 0x89 + 0x0160, // 0x8A + 0x2039, // 0x8B + 0x0152, // 0x8C + 0x008D, // 0x8D direct mapping + 0x017D, // 0x8E + 0x008F, // 0x8F directmapping + 0x0090, // 0x90 directmapping + 0x2018, // 0x91 + 0x2019, // 0x92 + 0x201C, // 0x93 + 0X201D, // 0x94 + 0x2022, // 0x95 + 0x2013, // 0x96 + 0x2014, // 0x97 + 0x02DC, // 0x98 + 0x2122, // 0x99 + 0x0161, // 0x9A + 0x203A, // 0x9B + 0x0153, // 0x9C + 0x009D, // 0x9D direct mapping + 0x017E, // 0x9E + 0x0178 // 0x9F +}; +// end taken from qtexthtmlparser + class DocumentHelper { public: - DocumentHelper(const QString& fileName, const QByteArray &data) + DocumentHelper(const QString &fileName, const QByteArray &data) : fileName(fileName) , data(readData(data)) {} ~DocumentHelper() {} @@ -131,8 +455,15 @@ private: while (j < length) { c = buf[j++]; if (c == QLatin1Char('<') || c == QLatin1Char('&')) { - if (count > 1) + if (count > 1 && c != QLatin1Char('&')) parsedContent.append(QLatin1Char(' ')); + else if (c == QLatin1Char('&')) { + // Note: this will modify the counter j, in case we sucessful parsed the entity + // we will have modified the counter to stay 1 before the closing ';', so + // the following if condition will be met with if (c == QLatin1Char(';')) + parsedContent.append(parseEntity(length, buf, j)); + } + count = 0; valid = false; continue; @@ -157,6 +488,63 @@ private: return parsedContent; } + // taken from qtexthtmlparser + // parses an entity after "&", and returns it + QString parseEntity(int len, const QChar *buf, int &pos) const + { + int recover = pos; + QString entity; + while (pos < len) { + QChar c = buf[pos++]; + if (c.isSpace() || pos - recover > 9) { + goto error; + } + if (c == QLatin1Char(';')) { + pos--; + break; + } + entity += c; + } + { + QChar resolved = resolveEntity(entity); + if (!resolved.isNull()) + return QString(resolved); + } + if (entity.length() > 1 && entity.at(0) == QLatin1Char('#')) { + entity.remove(0, 1); // removing leading # + + int base = 10; + bool ok = false; + + if (entity.at(0).toLower() == QLatin1Char('x')) { // hex entity? + entity.remove(0, 1); + base = 16; + } + + uint uc = entity.toUInt(&ok, base); + if (ok) { + if (uc >= 0x80 && uc < 0x80 + (sizeof(latin1Extended) / sizeof(latin1Extended[0]))) + uc = latin1Extended[uc - 0x80]; // windows latin 1 extended + QString str; + if (uc > 0xffff) { + // surrogate pair + uc -= 0x10000; + ushort high = uc/0x400 + 0xd800; + ushort low = uc%0x400 + 0xdc00; + str.append(QChar(high)); + str.append(QChar(low)); + } else { + str.append(QChar(uc)); + } + return str; + } + } + error: + pos = recover; + return QLatin1String(" "); + } + // end taken from qtexthtmlparser + private: QString fileName; QString data; @@ -239,7 +627,7 @@ void QHelpSearchIndexWriter::run() // old style qhc file < 4.4.2, need to convert... const QStringList indexedNamespaces = engine.customValue(oldKey). toString().split(QLatin1String("|"), QString::SkipEmptyParts); - foreach (const QString& nameSpace, indexedNamespaces) + foreach (const QString &nameSpace, indexedNamespaces) indexMap.insert(nameSpace, QDateTime()); engine.removeCustomValue(oldKey); } else { @@ -294,7 +682,7 @@ void QHelpSearchIndexWriter::run() } if (QCLuceneIndexReader::indexExists(indexPath) && !reindex) { - foreach(const QString& namespaceName, registeredDocs) { + foreach(const QString &namespaceName, registeredDocs) { mutexLocker.relock(); if (m_cancel) { emit indexingFinished(); @@ -342,7 +730,7 @@ void QHelpSearchIndexWriter::run() writer->setMaxFieldLength(QCLuceneIndexWriter::DEFAULT_MAX_FIELD_LENGTH); QStringList namespaces; - foreach(const QString& namespaceName, registeredDocs) { + foreach(const QString &namespaceName, registeredDocs) { mutexLocker.relock(); if (m_cancel) { writer->close(); @@ -367,7 +755,7 @@ void QHelpSearchIndexWriter::run() break; } else { bool bail = false; - foreach (const QStringList& attributes, attributeSets) { + foreach (const QStringList &attributes, attributeSets) { const QList<QUrl> docFiles = indexableFiles(&engine, namespaceName, attributes); if (!addDocuments(docFiles, engine, attributes, namespaceName, @@ -397,7 +785,7 @@ void QHelpSearchIndexWriter::run() mutexLocker.unlock(); QStringList indexedNamespaces = indexMap.keys(); - foreach(const QString& namespaceName, indexedNamespaces) { + foreach(const QString &namespaceName, indexedNamespaces) { mutexLocker.relock(); if (m_cancel) break; @@ -422,7 +810,7 @@ bool QHelpSearchIndexWriter::addDocuments(const QList<QUrl> docFiles, const QString attrList = attributes.join(QLatin1String(" ")); locker.unlock(); - foreach(const QUrl& url, docFiles) { + foreach(const QUrl &url, docFiles) { QCLuceneDocument document; DocumentHelper helper(url.toString(), engine.fileData(url)); if (helper.addFieldsToDocument(&document, namespaceName, attrList)) @@ -450,8 +838,8 @@ void QHelpSearchIndexWriter::removeDocuments(const QString &indexPath, reader.close(); } -bool QHelpSearchIndexWriter::writeIndexMap(QHelpEngineCore& engine, - const QMap<QString, QDateTime>& indexMap) +bool QHelpSearchIndexWriter::writeIndexMap(QHelpEngineCore &engine, + const QMap<QString, QDateTime> &indexMap) { QByteArray bArray; |