summaryrefslogtreecommitdiffstats
path: root/tools/linguist/shared/translatortools.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'tools/linguist/shared/translatortools.cpp')
-rw-r--r--tools/linguist/shared/translatortools.cpp505
1 files changed, 505 insertions, 0 deletions
diff --git a/tools/linguist/shared/translatortools.cpp b/tools/linguist/shared/translatortools.cpp
new file mode 100644
index 0000000..dcff546
--- /dev/null
+++ b/tools/linguist/shared/translatortools.cpp
@@ -0,0 +1,505 @@
+/****************************************************************************
+**
+** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
+** Contact: Qt Software Information (qt-info@nokia.com)
+**
+** This file is part of the Qt Linguist of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** No Commercial Usage
+** This file contains pre-release code and may not be distributed.
+** You may use this file in accordance with the terms and conditions
+** contained in the either Technology Preview License Agreement or the
+** Beta Release License Agreement.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Nokia gives you certain
+** additional rights. These rights are described in the Nokia Qt LGPL
+** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this
+** package.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 3.0 as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU General Public License version 3.0 requirements will be
+** met: http://www.gnu.org/copyleft/gpl.html.
+**
+** If you are unsure which license is appropriate for your use, please
+** contact the sales department at qt-sales@nokia.com.
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#include "translatortools.h"
+
+#include "simtexth.h"
+#include "translator.h"
+
+#include <QtCore/QDebug>
+#include <QtCore/QMap>
+#include <QtCore/QStringList>
+#include <QtCore/QTextCodec>
+#include <QtCore/QVector>
+
+typedef QList<TranslatorMessage> TML;
+typedef QMap<QString, TranslatorMessage> TMM;
+
+
+QT_BEGIN_NAMESPACE
+
+/*
+    Tells whether 'c' is a character that may sit between the digits of a
+    number, i.e. punctuation or whitespace.
+*/
+static bool isDigitFriendly(QChar c)
+{
+    if (c.isSpace())
+        return true;
+    return c.isPunct();
+}
+
+/*
+    Returns the length of the number that starts at index 'i' of 's', or 0
+    if 'i' is not a valid index or s[i] is not a digit.
+
+    A number starts with a digit and extends over further digits and over
+    runs of one or two digit-friendly characters (punctuation or space)
+    that are directly followed by another digit. "3.0" therefore counts as
+    a single number of length 3.
+*/
+static int numberLength(const QString &s, int i)
+{
+    // Reject out-of-range indices before touching s.at(i). The test used
+    // to be 'i < s.size()', which returned 0 for every valid index and
+    // thereby disabled the whole number heuristic.
+    if (i < 0 || i >= s.size() || !s.at(i).isDigit())
+        return 0;
+
+    int pos = i;
+    do {
+        ++i;
+    } while (i < s.size()
+             && (s.at(i).isDigit()
+                 || (isDigitFriendly(s[i])
+                     && i + 1 < s.size()
+                     && (s[i + 1].isDigit()
+                         || (isDigitFriendly(s[i + 1])
+                             && i + 2 < s.size()
+                             && s[i + 2].isDigit())))));
+    return i - pos;
+}
+
+
+/*
+    Returns a version of 'key' where all numbers have been replaced by zeroes. If
+    there were none, returns "".
+*/
+static QString zeroKey(const QString &key)
+{
+    QString zeroed;
+    bool metSomething = false;
+
+    // The index is advanced by hand: a number consumes exactly 'len'
+    // characters and anything else consumes one. Combining 'i += len' with
+    // a for-loop increment would drop the character after each number and
+    // could step past key.size() when the key ends in a number.
+    int i = 0;
+    while (i < key.size()) {
+        const int len = numberLength(key, i);
+        if (len > 0) {
+            zeroed.append(QLatin1Char('0'));
+            metSomething = true;
+            i += len;
+        } else {
+            zeroed.append(key.at(i));
+            ++i;
+        }
+    }
+    return metSomething ? zeroed : QString();
+}
+
+/*
+    Derives a tentative translation for 'newSource' from 'oldTranslation',
+    the existing translation of 'oldSource'. Every number of 'oldSource'
+    that is found in the old translation is replaced by the number at the
+    corresponding position in 'newSource'.
+
+    Doubtful results are flagged by appending hints such as " {3.1?}" or
+    " {1 or 2?}" to the returned text.
+*/
+static QString translationAttempt(const QString &oldTranslation,
+        const QString &oldSource, const QString &newSource)
+{
+    QString attempt;
+    QStringList oldNumbers;
+    QStringList newNumbers;
+    int i, j;
+    int k, ell, best;
+    int m, n;
+    int pass;
+
+    /*
+      This algorithm is hard to follow, so we'll consider an example
+      all along: oldTranslation is "XeT 3.0", oldSource is "TeX 3.0"
+      and newSource is "XeT 3.1".
+
+      First, we set up two tables: oldNumbers and newNumbers. In our
+      example, oldNumber[0] is "3.0" and newNumber[0] is "3.1".
+    */
+    for (i = 0, j = 0; i < oldSource.size(); i++, j++) {
+        m = numberLength(oldSource, i);
+        n = numberLength(newSource, j);
+        if (m > 0) {
+            // QString::mid() takes a character count, so take exactly m
+            // (resp. n) characters. Taking one more would drag the
+            // character after the number into the table entry and make
+            // the length check in the matching loop fail.
+            oldNumbers.append(oldSource.mid(i, m));
+            newNumbers.append(newSource.mid(j, n));
+            // Together with the loop increment this also steps over the
+            // character following the number, which cannot start a number.
+            i += m;
+            j += n;
+        }
+    }
+
+    // One slot per number found above. Sizing the bookkeeping from the
+    // table itself keeps the three containers in step by construction.
+    // QVector(int) default-constructs its elements, i.e. false and 0.
+    const int p = oldNumbers.size();
+    QVector<bool> met(p);
+    QVector<int> matchedYet(p);
+
+    /*
+      We now go over the old translation, "XeT 3.0", one letter at a
+      time, looking for numbers found in oldNumbers. Whenever such a
+      number is met, it is replaced with its newNumber equivalent. In
+      our example, the "3.0" of "XeT 3.0" becomes "3.1".
+    */
+    for (i = 0; i < oldTranslation.length(); i++) {
+        attempt += oldTranslation[i];
+        for (k = 0; k < p; k++) {
+            // matchedYet[k] may still equal the full length after a
+            // complete match that was rejected below; guard the index
+            // instead of reading one past the end of the number.
+            if (matchedYet[k] < oldNumbers.at(k).length()
+                && oldTranslation[i] == oldNumbers.at(k).at(matchedYet[k]))
+                matchedYet[k]++;
+            else
+                matchedYet[k] = 0;
+        }
+
+        /*
+          Let's find out if the last character ended a match. We make
+          two passes over the data. In the first pass, we try to
+          match only numbers that weren't matched yet; if that fails,
+          the second pass does the trick. This is useful in some
+          suspicious cases, flagged below.
+        */
+        for (pass = 0; pass < 2; pass++) {
+            best = p; // an impossible value
+            for (k = 0; k < p; k++) {
+                if ((!met[k] || pass > 0) &&
+                     matchedYet[k] == oldNumbers.at(k).length() &&
+                     numberLength(oldTranslation, i + 1 - matchedYet[k]) == matchedYet[k]) {
+                    // the longer the better
+                    if (best == p || matchedYet[k] > matchedYet[best])
+                        best = k;
+                }
+            }
+            if (best != p) {
+                attempt.truncate(attempt.length() - matchedYet[best]);
+                attempt += newNumbers.at(best);
+                met[best] = true;
+                for (k = 0; k < p; k++)
+                    matchedYet[k] = 0;
+                break;
+            }
+        }
+    }
+
+    /*
+      We flag two kinds of suspicious cases. They are identified as
+      such with comments such as "{2000?}" at the end.
+
+      Example of the first kind: old source text "TeX 3.0" translated
+      as "XeT 2.0" is flagged "TeX 2.0 {3.0?}", no matter what the
+      new text is.
+    */
+    for (k = 0; k < p; k++) {
+        if (!met[k])
+            attempt += QString(QLatin1String(" {")) + newNumbers.at(k) + QString(QLatin1String("?}"));
+    }
+
+    /*
+      Example of the second kind: "1 of 1" translated as "1 af 1",
+      with new source text "1 of 2", generates "1 af 2 {1 or 2?}"
+      because it's not clear which of "1 af 2" and "2 af 1" is right.
+    */
+    for (k = 0; k < p; k++) {
+        for (ell = 0; ell < p; ell++) {
+            if (k != ell && oldNumbers.at(k) == oldNumbers.at(ell) &&
+                    newNumbers.at(k) < newNumbers.at(ell))
+                attempt += QString(QLatin1String(" {")) + newNumbers.at(k) + QString(QLatin1String(" or ")) +
+                           newNumbers.at(ell) + QString(QLatin1String("?}"));
+        }
+    }
+    return attempt;
+}
+
+
+/*
+    Fills in untranslated messages of 'tor' from existing translations
+    whose source text differs only in its numbers.
+
+    For example, if "TeX 3.0" is translated as "XeT 3.0" and "TeX 3.1"
+    has no translation, "XeT 3.1" is set as the translation of the
+    latter, which stays Unfinished.
+
+    Returns the number of messages that received a translation this way.
+*/
+int applyNumberHeuristic(Translator &tor)
+{
+    TMM donors;     // zeroed source text -> translated, non-Unfinished message
+    TMM candidates; // context, source text and comment -> untranslated message
+    TML all = tor.messages();
+
+    // Sort the messages into translation donors and candidates.
+    for (int idx = 0; idx < all.size(); ++idx) {
+        TranslatorMessage &msg = all[idx];
+        const bool hasTranslation = msg.isTranslated();
+        if (msg.type() == TranslatorMessage::Unfinished) {
+            if (hasTranslation)
+                continue;
+            candidates.insert(msg.context() + QLatin1Char('\n')
+                              + msg.sourceText() + QLatin1Char('\n')
+                              + msg.comment(), msg);
+        } else if (hasTranslation && msg.translations().count() == 1) {
+            donors.insert(zeroKey(msg.sourceText()), msg);
+        }
+    }
+
+    int inserted = 0;
+    for (TMM::Iterator cand = candidates.begin(); cand != candidates.end(); ++cand) {
+        TMM::Iterator donor = donors.find(zeroKey(cand->sourceText()));
+        // An empty key means the source text contains no number at all.
+        if (donor == donors.end() || donor.key().isEmpty())
+            continue;
+        if (donor->sourceText() == cand->sourceText())
+            continue;
+
+        TranslatorMessage m = *cand;
+        m.setTranslation(translationAttempt(donor->translation(), donor->sourceText(),
+                                            cand->sourceText()));
+        tor.replace(m);
+        inserted++;
+    }
+    return inserted;
+}
+
+
+/*
+    Fills in untranslated messages of 'tor' with trivially derived
+    translations.
+
+    For example, if "Enabled:" is consistently translated as
+    "Eingeschaltet:" no matter the context or the comment,
+    "Eingeschaltet:" is set as the translation of any untranslated
+    Unfinished "Enabled:" message. Source texts that have been translated
+    in more than one way are left alone.
+
+    Returns the number of messages that received a translation this way.
+*/
+
+int applySameTextHeuristic(Translator &tor)
+{
+    TMM unique;     // source text -> message, while only one translation was seen
+    TMM ambiguous;  // source texts seen with conflicting translations
+    TML pending;    // untranslated Unfinished messages
+    TML all = tor.messages();
+
+    for (int idx = 0; idx < all.size(); ++idx) {
+        TranslatorMessage &msg = all[idx];
+        if (!msg.isTranslated()) {
+            if (msg.type() == TranslatorMessage::Unfinished)
+                pending.append(msg);
+            continue;
+        }
+
+        QString key = msg.sourceText();
+        TMM::Iterator known = unique.find(key);
+        if (known == unique.end()) {
+            if (!ambiguous.contains(key))
+                unique.insert(key, msg);
+        } else if (known->translations() != msg.translations()) {
+            // The same source text is translated at least two
+            // different ways: never use it as a donor.
+            unique.remove(key);
+            ambiguous.insert(key, msg);
+        }
+    }
+
+    int inserted = 0;
+    for (int idx = 0; idx < pending.size(); ++idx) {
+        TMM::Iterator match = unique.find(pending[idx].sourceText());
+        if (match == unique.end())
+            continue;
+
+        TranslatorMessage m = pending[idx];
+        m.setTranslations(match->translations());
+        tor.replace(m);
+        ++inserted;
+    }
+    return inserted;
+}
+
+
+
+/*
+ Merges two Translator objects. The first one
+ is a set of source texts and translations for a previous version of
+ the internationalized program; the second one is a set of fresh
+ source texts newly extracted from the source code, without any
+ translation yet.
+
+ The result is returned as a new Translator that takes over the
+ language code, source language code, locations type and codec name
+ of 'tor'. Neither input is modified.
+
+ 'options' selects the optional heuristics (HeuristicSimilarText,
+ HeuristicSameText, HeuristicNumber), whether virgin-only messages are
+ appended or inserted sorted (NoLocations), and whether a summary is
+ appended to 'err' (Verbose).
+
+ NOTE(review): with NoObsolete the summary says "Removed", but the
+ obsolete messages are still appended to the result here; presumably
+ the caller strips them afterwards - confirm.
+*/
+
+Translator merge(const Translator &tor, const Translator &virginTor,
+ UpdateOptions options, QString &err)
+{
+ int known = 0; // non-obsolete messages of 'tor' found again in 'virginTor'
+ int neww = 0; // revived obsolete messages, similar-text matches and virgin-only messages
+ int obsoleted = 0; // messages that become Obsolete in this merge
+ int similarTextHeuristicCount = 0;
+
+ Translator outTor;
+ outTor.setLanguageCode(tor.languageCode());
+ outTor.setSourceLanguageCode(tor.sourceLanguageCode());
+ outTor.setLocationsType(tor.locationsType());
+ outTor.setCodecName(tor.codecName());
+
+ /*
+ The types of all the messages from the vernacular translator
+ are updated according to the virgin translator.
+ */
+ foreach (TranslatorMessage m, tor.messages()) { // m is a copy; the edits below go into outTor only
+ TranslatorMessage::Type newType = TranslatorMessage::Finished;
+
+ if (m.sourceText().isEmpty()) {
+ // context/file comment
+ TranslatorMessage mv = virginTor.find(m.context());
+ if (!mv.isNull())
+ m.setComment(mv.comment());
+ } else {
+ TranslatorMessage mv = virginTor.find(m.context(), m.sourceText(), m.comment());
+ if (mv.isNull()) {
+ if (!(options & HeuristicSimilarText)) {
+ newType = TranslatorMessage::Obsolete;
+ if (m.type() != TranslatorMessage::Obsolete)
+ obsoleted++;
+ m.clearReferences();
+ } else {
+ mv = virginTor.find(m.context(), m.comment(), m.allReferences()); // retry by context, comment and references
+ if (mv.isNull()) {
+ // did not find it in the virgin, mark it as obsolete
+ newType = TranslatorMessage::Obsolete;
+ if (m.type() != TranslatorMessage::Obsolete)
+ obsoleted++;
+ m.clearReferences();
+ } else {
+ // Do not just accept it if its on the same line number,
+ // but different source text.
+ // Also check if the texts are more or less similar before
+ // we consider them to represent the same message...
+ if (getSimilarityScore(m.sourceText(), mv.sourceText()) >= textSimilarityThreshold) {
+ // It is just slightly modified, assume that it is the same string
+
+ // Mark it as unfinished. (Since the source text
+ // was changed it might require re-translating...)
+ newType = TranslatorMessage::Unfinished;
+ ++similarTextHeuristicCount;
+ neww++;
+
+ m.setOldSourceText(m.sourceText()); // keep the previous text before overwriting it
+ m.setSourceText(mv.sourceText());
+ const QString &oldpluralsource = m.extra(QLatin1String("po-msgid_plural"));
+ if (!oldpluralsource.isEmpty()) {
+ m.setExtra(QLatin1String("po-old_msgid_plural"), oldpluralsource);
+ m.unsetExtra(QLatin1String("po-msgid_plural"));
+ }
+ m.setReferences(mv.allReferences()); // Update secondary references
+ m.setPlural(mv.isPlural());
+ m.setUtf8(mv.isUtf8());
+ m.setExtraComment(mv.extraComment());
+ } else {
+ // The virgin and vernacular sourceTexts are so
+ // different that we could not find it.
+ newType = TranslatorMessage::Obsolete;
+ if (m.type() != TranslatorMessage::Obsolete)
+ obsoleted++;
+ m.clearReferences();
+ }
+ }
+ }
+ } else {
+ switch (m.type()) {
+ case TranslatorMessage::Finished:
+ default:
+ if (m.isPlural() == mv.isPlural()) {
+ newType = TranslatorMessage::Finished;
+ } else {
+ newType = TranslatorMessage::Unfinished; // plural form changed, needs another look
+ }
+ known++;
+ break;
+ case TranslatorMessage::Unfinished:
+ newType = TranslatorMessage::Unfinished;
+ known++;
+ break;
+ case TranslatorMessage::Obsolete:
+ newType = TranslatorMessage::Unfinished;
+ neww++; // last case: leaves the switch without a break
+ }
+
+ // Always get the filename and linenumber info from the
+ // virgin Translator, in case it has changed location.
+ // This should also enable us to read a file that does not
+ // have the <location> element.
+ // why not use operator=()? Because it overwrites e.g. userData.
+ m.setReferences(mv.allReferences());
+ m.setPlural(mv.isPlural());
+ m.setUtf8(mv.isUtf8());
+ m.setExtraComment(mv.extraComment());
+ }
+ }
+
+ m.setType(newType);
+ outTor.append(m);
+ }
+
+ /*
+ Messages found only in the virgin translator are added to the
+ vernacular translator.
+ */
+ foreach (const TranslatorMessage &mv, virginTor.messages()) {
+ if (mv.sourceText().isEmpty()) {
+ if (tor.contains(mv.context()))
+ continue;
+ } else {
+ if (tor.contains(mv.context(), mv.sourceText(), mv.comment()))
+ continue;
+ if (options & HeuristicSimilarText) {
+ TranslatorMessage m = tor.find(mv.context(), mv.comment(), mv.allReferences());
+ if (!m.isNull()) {
+ if (getSimilarityScore(m.sourceText(), mv.sourceText()) >= textSimilarityThreshold)
+ continue; // same test as in the first loop, which already merged this message
+ }
+ }
+ }
+ if (options & NoLocations)
+ outTor.append(mv);
+ else
+ outTor.appendSorted(mv);
+ if (!mv.sourceText().isEmpty())
+ ++neww; // entries with an empty source text are not counted
+ }
+
+ /*
+ The same-text heuristic handles cases where a message has an
+ obsolete counterpart with a different context or comment.
+ */
+ int sameTextHeuristicCount = (options & HeuristicSameText) ? applySameTextHeuristic(outTor) : 0;
+
+ /*
+ The number heuristic handles cases where a message has an
+ obsolete counterpart with mostly numbers differing in the
+ source text.
+ */
+ int sameNumberHeuristicCount = (options & HeuristicNumber) ? applyNumberHeuristic(outTor) : 0;
+
+ if (options & Verbose) { // the summary is appended to 'err'
+ int totalFound = neww + known;
+ err += QObject::tr(" Found %n source text(s) (%1 new and %2 already existing)\n", 0, totalFound).arg(neww).arg(known);
+
+ if (obsoleted) {
+ if (options & NoObsolete) {
+ err += QObject::tr(" Removed %n obsolete entries\n", 0, obsoleted);
+ } else {
+ err += QObject::tr(" Kept %n obsolete entries\n", 0, obsoleted);
+ }
+ }
+
+ if (sameNumberHeuristicCount)
+ err += QObject::tr(" Number heuristic provided %n translation(s)\n",
+ 0, sameNumberHeuristicCount);
+ if (sameTextHeuristicCount)
+ err += QObject::tr(" Same-text heuristic provided %n translation(s)\n",
+ 0, sameTextHeuristicCount);
+ if (similarTextHeuristicCount)
+ err += QObject::tr(" Similar-text heuristic provided %n translation(s)\n",
+ 0, similarTextHeuristicCount);
+ }
+ return outTor;
+}
+
+QT_END_NAMESPACE