summary | refs | log | tree | commit | diff | stats
path: root/tests
diff options
context:
space:
mode:
author: Thiago Macieira <thiago.macieira@nokia.com> 2011-03-19 13:09:41 (GMT)
committer: Thiago Macieira <thiago.macieira@nokia.com> 2011-03-22 14:51:56 (GMT)
commit: dfadfaebe1e8a99230d511645bb1fa24bdf9e033 (patch)
tree: c8e759dd116ec2580341581d285ec65593bece55 /tests
parent: 86ee899d3d01463c55ee9ba753ee3d47f87ad07d (diff)
download: Qt-dfadfaebe1e8a99230d511645bb1fa24bdf9e033.zip
Qt-dfadfaebe1e8a99230d511645bb1fa24bdf9e033.tar.gz
Qt-dfadfaebe1e8a99230d511645bb1fa24bdf9e033.tar.bz2
Add UTF-8 code benchmarks
Also compare to the Latin-1 functions. The UTF-8 algorithm in Qt 4.7 is currently 109% slower than the unoptimised Latin-1 algorithm, and 120% slower than the Qt 4.7 SSE2 code.
Diffstat (limited to 'tests')
-rw-r--r-- tests/benchmarks/corelib/tools/qstring/main.cpp 223
1 file changed, 223 insertions, 0 deletions
diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp
index f2d6de7..f505151 100644
--- a/tests/benchmarks/corelib/tools/qstring/main.cpp
+++ b/tests/benchmarks/corelib/tools/qstring/main.cpp
@@ -74,6 +74,8 @@ private slots:
void fromLatin1() const;
void fromLatin1Alternatives_data() const;
void fromLatin1Alternatives() const;
+ void fromUtf8Alternatives_data() const;
+ void fromUtf8Alternatives() const;
};
void tst_QString::equals() const
@@ -1706,6 +1708,227 @@ void tst_QString::fromLatin1Alternatives() const
}
}
+typedef int (* FromUtf8Function)(ushort *, const char *, int); // signature shared by every routine benchmarked below: (UTF-16 output buffer, input bytes, input byte count) -> int result
+Q_DECLARE_METATYPE(FromUtf8Function) // needed so the pointer can be used as the "function" column type of the data-driven test
+
+extern QTextCodec::ConverterState *state; // declaration placed ahead of the definition on the next line
+QTextCodec::ConverterState *state = 0; // null by default; it exists just because the code copied from qutfcodec.cpp uses a state
+
+// Adapter giving fromLatin1_regular() the FromUtf8Function signature so it can
+// be listed next to the UTF-8 decoders. The input byte count is handed back
+// unchanged as the result.
+int fromUtf8_latin1_regular(ushort *dst, const char *chars, int len)
+{
+    const int byteCount = len;
+    fromLatin1_regular(dst, chars, byteCount);
+    return byteCount;
+}
+
+// Adapter giving fromLatin1_sse2_qt47() the FromUtf8Function signature so it
+// can be listed next to the UTF-8 decoders. The input byte count is handed
+// back unchanged as the result.
+int fromUtf8_latin1_qt47(ushort *dst, const char *chars, int len)
+{
+    const int byteCount = len;
+    fromLatin1_sse2_qt47(dst, chars, byteCount);
+    return byteCount;
+}
+
+// Adapter giving fromLatin1_sse2_improved() the FromUtf8Function signature so
+// it can be listed next to the UTF-8 decoders. The input byte count is handed
+// back unchanged as the result.
+int fromUtf8_latin1best(ushort *dst, const char *chars, int len)
+{
+    const int byteCount = len;
+    fromLatin1_sse2_improved(dst, chars, byteCount);
+    return byteCount;
+}
+
+// Reports whether ucs4 is a Unicode "non-character": a code point that may be
+// used internally but is not allowed in text interchange.
+//
+// Two families are recognised:
+//  - the last two entries of each Unicode plane (U+FFFE, U+FFFF, U+1FFFE,
+//    U+1FFFF, etc.);
+//  - the 32 entries from U+FDD0 to U+FDEF (inclusive).
+//
+// Returns true for a non-character, false otherwise.
+static inline bool isUnicodeNonCharacter(uint ucs4)
+{
+    // Unsigned subtraction wraps around for values below U+FDD0, so a single
+    // comparison bounds the range on both sides. The range holds 32 code
+    // points (0xfdef - 0xfdd0 + 1), not 16.
+    const bool inFdd0Range = (ucs4 - 0xfdd0U) < 32;
+    const bool lastTwoOfPlane = (ucs4 & 0xfffe) == 0xfffe;
+    return lastTwoOfPlane || inFdd0Range;
+}
+
+int fromUtf8_qt47(ushort *dst, const char *chars, int len) // decodes len bytes of UTF-8 at chars into UTF-16 units stored at dst
+{
+ // This is almost the code found in Qt 4.7's qutfcodec.cpp QUtf8Codec::convertToUnicode.
+ // That function returns a QString; this one writes into the caller's buffer and returns the number of UTF-16 units written.
+ // That's to avoid doing malloc() inside the benchmark test; dst must be large enough for the worst case (cf. the commented-out allocation below).
+ // Any differences between this code and the original are just because of that, I promise
+
+ bool headerdone = false; // true once a leading byte order mark should no longer be looked for
+ ushort replacement = QChar::ReplacementCharacter; // unit emitted in place of each invalid sequence
+ int need = 0; // continuation bytes still expected for the sequence in progress
+ int error = -1; // index of the lead byte of the sequence in progress
+ uint uc = 0; // code point bits accumulated so far
+ uint min_uc = 0; // smallest code point acceptable for the current sequence length; smaller means overlong
+ if (state) { // pick up flags and a partial sequence recorded in the file-scope state
+ if (state->flags & QTextCodec::IgnoreHeader)
+ headerdone = true;
+ if (state->flags & QTextCodec::ConvertInvalidToNull)
+ replacement = QChar::Null;
+ need = state->remainingChars;
+ if (need) {
+ uc = state->state_data[0];
+ min_uc = state->state_data[1];
+ }
+ }
+ if (!headerdone && len > 3 // shortcut only; an input of exactly 3 bytes is still handled by the 0xfeff test inside the loop
+ && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) {
+ // starts with a byte order mark
+ chars += 3;
+ len -= 3;
+ headerdone = true;
+ }
+
+ // QString result(need + len + 1, Qt::Uninitialized); // worst case
+ // ushort *qch = (ushort *)result.unicode();
+ ushort *qch = dst; // write cursor into the caller-supplied buffer
+ uchar ch;
+ int invalid = 0; // number of replacement units emitted
+
+ for (int i = 0; i < len; ++i) {
+ ch = chars[i];
+ if (need) { // in the middle of a multi-byte sequence
+ if ((ch&0xc0) == 0x80) { // continuation byte (10xxxxxx): append its 6 payload bits
+ uc = (uc << 6) | (ch & 0x3f);
+ --need;
+ if (!need) { // sequence complete: classify the code point
+ // utf-8 bom composes into 0xfeff code point
+ bool nonCharacter; // assigned by the first else-if below before the second one reads it
+ if (!headerdone && uc == 0xfeff) {
+ // don't do anything, just skip the BOM
+ } else if (!(nonCharacter = isUnicodeNonCharacter(uc)) && uc > 0xffff && uc < 0x110000) {
+ // surrogate pair
+ //Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
+ *qch++ = QChar::highSurrogate(uc);
+ *qch++ = QChar::lowSurrogate(uc);
+ } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || nonCharacter || uc >= 0x110000) {
+ // error: overlong sequence, UTF16 surrogate or non-character
+ *qch++ = replacement;
+ ++invalid;
+ } else {
+ *qch++ = uc; // valid code point that fits in a single UTF-16 unit
+ }
+ headerdone = true;
+ }
+ } else {
+ // error
+ i = error; // rewind to the lead byte; the loop's ++i then resumes with the byte right after it
+ *qch++ = replacement;
+ ++invalid;
+ need = 0;
+ headerdone = true;
+ }
+ } else {
+ if (ch < 128) { // single-byte (ASCII) character
+ *qch++ = ushort(ch);
+ headerdone = true;
+ } else if ((ch & 0xe0) == 0xc0) { // lead byte 110xxxxx: one continuation byte follows
+ uc = ch & 0x1f;
+ need = 1;
+ error = i;
+ min_uc = 0x80;
+ headerdone = true;
+ } else if ((ch & 0xf0) == 0xe0) { // lead byte 1110xxxx: two continuation bytes follow
+ uc = ch & 0x0f;
+ need = 2;
+ error = i;
+ min_uc = 0x800; // headerdone is not set in this branch, so a BOM (itself a 3-byte sequence) can still be recognised on completion
+ } else if ((ch&0xf8) == 0xf0) { // lead byte 11110xxx: three continuation bytes follow
+ uc = ch & 0x07;
+ need = 3;
+ error = i;
+ min_uc = 0x10000;
+ headerdone = true;
+ } else {
+ // error
+ *qch++ = replacement;
+ ++invalid;
+ headerdone = true;
+ }
+ }
+ }
+ if (!state && need > 0) { // input ended mid-sequence and there is no state to carry it into a later call
+ // unterminated UTF sequence
+ for (int i = error; i < len; ++i) { // one replacement per byte, from the lead byte to the end of the input
+ *qch++ = replacement;
+ ++invalid;
+ }
+ }
+ //result.truncate(qch - (ushort *)result.unicode());
+ if (state) { // record progress so a later call can continue where this one stopped
+ state->invalidChars += invalid;
+ state->remainingChars = need;
+ if (headerdone)
+ state->flags |= QTextCodec::IgnoreHeader;
+ state->state_data[0] = need ? uc : 0;
+ state->state_data[1] = need ? min_uc : 0;
+ }
+ //return result;
+ return qch - dst; // number of UTF-16 units written to dst
+}
+
+// Supplies the data rows for fromUtf8Alternatives(): one row per conversion
+// routine, stored in the single "function" column.
+void tst_QString::fromUtf8Alternatives_data() const
+{
+    QTest::addColumn<FromUtf8Function>("function");
+
+    // Row tag paired with the routine it selects; rows are registered in
+    // table order.
+    static const struct {
+        const char *tag;
+        FromUtf8Function function;
+    } candidates[] = {
+        { "latin1-regular", &fromUtf8_latin1_regular },
+        { "latin1-best", &fromUtf8_latin1best },
+        { "latin1-qt4.7", &fromUtf8_latin1_qt47 },
+        { "qt-4.7", &fromUtf8_qt47 }
+    };
+    const int candidateCount = int(sizeof(candidates) / sizeof(candidates[0]));
+
+    for (int row = 0; row < candidateCount; ++row)
+        QTest::newRow(candidates[row].tag) << candidates[row].function;
+}
+
+extern StringData fromUtf8Data;
+
+// Runs `function` over every entry of fromUtf8Data, writing into dst.
+//
+// doVerify == false (benchmark pass): the Latin-1 data set is run first through
+//   fromLatin1Alternatives_internal(), then every UTF-8 entry is converted to
+//   the start of dst with no checking.
+// doVerify == true (verification pass): rows whose data tag starts with
+//   "latin1-" are skipped. For the others, each entry is converted at offset 8
+//   of an 'x'-filled dst, the result is compared against QString::fromUtf8(),
+//   and the 8 units before and after the output are checked to be untouched.
+//
+// dst must hold at least the longest entry plus 16 units.
+static void fromUtf8Alternatives_internal(FromUtf8Function function, QString &dst, bool doVerify)
+{
+    if (!doVerify) {
+        // NOTE: this only works because the Latin1 data is ASCII-only
+        // NOTE(review): `function` is called through the FromLatin1Function
+        // pointer type here; confirm the two signatures are call-compatible.
+        fromLatin1Alternatives_internal(reinterpret_cast<FromLatin1Function>(function), dst, doVerify);
+    } else {
+        // The Latin-1 routines cannot reproduce QString::fromUtf8() output,
+        // so there is nothing to verify for them.
+        if (strncmp(QTest::currentDataTag(), "latin1-", 7) == 0)
+            return;
+    }
+
+    // Layout the raw entry table is read with.
+    struct Entry
+    {
+        int len;
+        int offset1, offset2;
+        int align1, align2;
+    };
+    const Entry *entries = reinterpret_cast<const Entry *>(fromUtf8Data.entries);
+
+    for (int i = 0; i < fromUtf8Data.entryCount; ++i) {
+        int len = entries[i].len;
+        const char *src = fromUtf8Data.charData + entries[i].offset1;
+
+        if (!doVerify) {
+            (function)(&dst.data()->unicode(), src, len);
+        } else {
+            // Reset the buffer so the guard regions start out as 'x'.
+            dst.fill(QChar('x'), dst.length());
+
+            int utf8len = (function)(&dst.data()->unicode() + 8, src, len);
+
+            QString expected = QString::fromUtf8(src, len);
+            QCOMPARE(utf8len, expected.length());
+
+            QString final = dst.mid(8, utf8len);
+            QCOMPARE(final, expected);
+
+            // The guard after the output starts where the output ends, i.e.
+            // after utf8len units. Using the input byte count instead would
+            // skip the units directly behind the output whenever the input
+            // contains multi-byte sequences.
+            QString guard(8, QChar('x'));
+            QCOMPARE(dst.left(8), guard);
+            QCOMPARE(dst.mid(utf8len + 8, 8), guard);
+        }
+    }
+}
+
+// Data-driven test body: checks the selected routine once, then measures it.
+void tst_QString::fromUtf8Alternatives() const
+{
+    QFETCH(FromUtf8Function, function);
+
+    // Output buffer pre-filled with 'x': the longest entry plus 16 extra units.
+    const int bufferLength = fromUtf8Data.maxLength + 16;
+    QString dst(bufferLength, QChar('x'));
+
+    // Verification pass, outside the measured region.
+    const bool verifying = true;
+    fromUtf8Alternatives_internal(function, dst, verifying);
+
+    // Measured pass with verification switched off.
+    QBENCHMARK {
+        fromUtf8Alternatives_internal(function, dst, !verifying);
+    }
+}
+
QTEST_MAIN(tst_QString)
#include "main.moc"