diff options
author | Thiago Macieira <thiago.macieira@nokia.com> | 2011-03-19 13:24:34 (GMT) |
---|---|---|
committer | Thiago Macieira <thiago.macieira@nokia.com> | 2011-03-22 14:51:56 (GMT) |
commit | 6934b9b7be067d1c61c8796f8a49d40b3b206a3e (patch) | |
tree | b95bf141cea4f79614433eabed42f06084fd8628 /tests | |
parent | dfadfaebe1e8a99230d511645bb1fa24bdf9e033 (diff) | |
download | Qt-6934b9b7be067d1c61c8796f8a49d40b3b206a3e.zip Qt-6934b9b7be067d1c61c8796f8a49d40b3b206a3e.tar.gz Qt-6934b9b7be067d1c61c8796f8a49d40b3b206a3e.tar.bz2 |
Add a stateless copy of the Qt 4.7 UTF-8 codec.
For whatever reason, this code is worse than the stateful code...
Diffstat (limited to 'tests')
-rw-r--r-- | tests/benchmarks/corelib/tools/qstring/main.cpp | 99 |
1 files changed, 99 insertions, 0 deletions
diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp index f505151..c7a5096 100644 --- a/tests/benchmarks/corelib/tools/qstring/main.cpp +++ b/tests/benchmarks/corelib/tools/qstring/main.cpp @@ -1865,6 +1865,104 @@ int fromUtf8_qt47(ushort *dst, const char *chars, int len) return qch - dst; } +int fromUtf8_qt47_stateless(ushort *dst, const char *chars, int len) +{ + // This is the same code as above, but for stateless UTF-8 conversion + // no other improvements + bool headerdone = false; + const ushort replacement = QChar::ReplacementCharacter; + int need = 0; + int error = -1; + uint uc = 0; + uint min_uc = 0; + + if (len > 3 + && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) { + // starts with a byte order mark + chars += 3; + len -= 3; + } + + // QString result(need + len + 1, Qt::Uninitialized); // worst case + // ushort *qch = (ushort *)result.unicode(); + ushort *qch = dst; + uchar ch; + int invalid = 0; + + for (int i = 0; i < len; ++i) { + ch = chars[i]; + if (need) { + if ((ch&0xc0) == 0x80) { + uc = (uc << 6) | (ch & 0x3f); + --need; + if (!need) { + // utf-8 bom composes into 0xfeff code point + bool nonCharacter; + if (!headerdone && uc == 0xfeff) { + // don't do anything, just skip the BOM + } else if (!(nonCharacter = isUnicodeNonCharacter(uc)) && uc > 0xffff && uc < 0x110000) { + // surrogate pair + //Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length()); + *qch++ = QChar::highSurrogate(uc); + *qch++ = QChar::lowSurrogate(uc); + } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || nonCharacter || uc >= 0x110000) { + // error: overlong sequence, UTF16 surrogate or non-character + *qch++ = replacement; + ++invalid; + } else { + *qch++ = uc; + } + headerdone = true; + } + } else { + // error + i = error; + *qch++ = replacement; + ++invalid; + need = 0; + headerdone = true; + } + } else { + if (ch < 128) { + *qch++ = ushort(ch); + headerdone = true; + } else if ((ch & 0xe0) == 0xc0) { + uc = ch & 0x1f; + need = 1; + error = i; + min_uc = 0x80; + headerdone = true; + } else if ((ch & 0xf0) == 0xe0) { + uc = ch & 0x0f; + need = 2; + error = i; + min_uc = 0x800; + } else if ((ch&0xf8) == 0xf0) { + uc = ch & 0x07; + need = 3; + error = i; + min_uc = 0x10000; + headerdone = true; + } else { + // error + *qch++ = replacement; + ++invalid; + headerdone = true; + } + } + } + if (need > 0) { + // unterminated UTF sequence + for (int i = error; i < len; ++i) { + *qch++ = replacement; + ++invalid; + } + } + //result.truncate(qch - (ushort *)result.unicode()); + //return result; + return qch - dst; +} + void tst_QString::fromUtf8Alternatives_data() const { QTest::addColumn<FromUtf8Function>("function"); @@ -1872,6 +1970,7 @@ void tst_QString::fromUtf8Alternatives_data() const QTest::newRow("latin1-best") << &fromUtf8_latin1best; QTest::newRow("latin1-qt4.7") << &fromUtf8_latin1_qt47; QTest::newRow("qt-4.7") << &fromUtf8_qt47; + QTest::newRow("qt-4.7-stateless") << &fromUtf8_qt47_stateless; } extern StringData fromUtf8Data; |