diff options
-rw-r--r-- | src/corelib/codecs/qutfcodec.cpp | 11 | ||||
-rw-r--r-- | tests/auto/qtextcodec/tst_qtextcodec.cpp | 60 | ||||
-rw-r--r-- | tests/auto/qtextstream/tst_qtextstream.cpp | 7 |
3 files changed, 77 insertions, 1 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp index 6611315..27c0572 100644 --- a/src/corelib/codecs/qutfcodec.cpp +++ b/src/corelib/codecs/qutfcodec.cpp @@ -184,7 +184,10 @@ void QUtf8Codec::convertToUnicode(QString *target, const char *chars, int len, C uc = (uc << 6) | (ch & 0x3f); need--; if (!need) { - if (uc > 0xffff && uc < 0x110000) { + // utf-8 bom composes into 0xfeff code point + if (!headerdone && uc == 0xfeff) { + // dont do anything, just skip the BOM + } else if (uc > 0xffff && uc < 0x110000) { // surrogate pair uc -= 0x10000; unsigned short high = uc/0x400 + 0xd800; @@ -206,6 +209,7 @@ void QUtf8Codec::convertToUnicode(QString *target, const char *chars, int len, C } else { *qch++ = uc; } + headerdone = true; } } else { // error @@ -213,15 +217,18 @@ void QUtf8Codec::convertToUnicode(QString *target, const char *chars, int len, C *qch++ = replacement; ++invalid; need = 0; + headerdone = true; } } else { if (ch < 128) { *qch++ = QLatin1Char(ch); + headerdone = true; } else if ((ch & 0xe0) == 0xc0) { uc = ch & 0x1f; need = 1; error = i; min_uc = 0x80; + headerdone = true; } else if ((ch & 0xf0) == 0xe0) { uc = ch & 0x0f; need = 2; @@ -232,10 +239,12 @@ void QUtf8Codec::convertToUnicode(QString *target, const char *chars, int len, C need = 3; error = i; min_uc = 0x10000; + headerdone = true; } else { // error *qch++ = replacement; ++invalid; + headerdone = true; } } } diff --git a/tests/auto/qtextcodec/tst_qtextcodec.cpp b/tests/auto/qtextcodec/tst_qtextcodec.cpp index 97c409b..88dbaf7 100644 --- a/tests/auto/qtextcodec/tst_qtextcodec.cpp +++ b/tests/auto/qtextcodec/tst_qtextcodec.cpp @@ -74,6 +74,9 @@ private slots: void utf8Codec_data(); void utf8Codec(); + void utf8bom_data(); + void utf8bom(); + void utfHeaders_data(); void utfHeaders(); @@ -1513,6 +1516,63 @@ void tst_QTextCodec::utf8Codec() QCOMPARE(str, res); } +void tst_QTextCodec::utf8bom_data() +{ + QTest::addColumn<QByteArray>("data"); + QTest::addColumn<QString>("result"); + + QTest::newRow("nobom") + << QByteArray("\302\240", 2) + << QString("\240"); + + { + static const ushort data[] = { 0x201d }; + QTest::newRow("nobom 2") + << QByteArray("\342\200\235", 3) + << QString::fromUtf16(data, sizeof(data)/sizeof(short)); + } + + { + static const ushort data[] = { 0xf000 }; + QTest::newRow("bom1") + << QByteArray("\357\200\200", 3) + << QString::fromUtf16(data, sizeof(data)/sizeof(short)); + } + + { + static const ushort data[] = { 0xfec0 }; + QTest::newRow("bom2") + << QByteArray("\357\273\200", 3) + << QString::fromUtf16(data, sizeof(data)/sizeof(short)); + } + + { + QTest::newRow("normal-bom") + << QByteArray("\357\273\277a", 4) + << QString("a"); + } + + { + static const ushort data[] = { 0x61, 0xfeff, 0x62 }; + QTest::newRow("middle-bom") + << QByteArray("a\357\273\277b", 5) + << QString::fromUtf16(data, sizeof(data)/sizeof(short)); + } +} + +void tst_QTextCodec::utf8bom() +{ + QFETCH(QByteArray, data); + QFETCH(QString, result); + + QTextCodec *const codec = QTextCodec::codecForMib(106); // UTF-8 + Q_ASSERT(codec); + + QCOMPARE(codec->toUnicode(data.constData(), data.length(), 0), result); + + QTextCodec::ConverterState state; + QCOMPARE(codec->toUnicode(data.constData(), data.length(), &state), result); +} void tst_QTextCodec::utfHeaders_data() { diff --git a/tests/auto/qtextstream/tst_qtextstream.cpp b/tests/auto/qtextstream/tst_qtextstream.cpp index cf495d5..9573957 100644 --- a/tests/auto/qtextstream/tst_qtextstream.cpp +++ b/tests/auto/qtextstream/tst_qtextstream.cpp @@ -553,6 +553,13 @@ void tst_QTextStream::generateLineData(bool for_QString) QTest::newRow("threelines/crlf/crlf/nothing") << QByteArray("ole\r\ndole\r\ndoffen") << (QStringList() << "ole" << "dole" << "doffen"); if (!for_QString) { + // utf-8 + QTest::newRow("utf8/twolines") + << QByteArray("\xef\xbb\xbf" + "\x66\x67\x65\x0a" + "\x66\x67\x65\x0a", 11) + << (QStringList() << "fge" << "fge"); + // utf-16 // one line QTest::newRow("utf16-BE/nothing") |