UTF-8 text codec should be able to convert data when fed one byte at a time.

When the input data was fed to the UTF-8 codec one byte at a time, it couldn't parse the BOM correctly. So we now wait until the BOM is composed into a code point and check it afterwards. Reviewed-by: Olivier Goffart
author: Denis Dzyubenko <denis.dzyubenko@nokia.com> 2009-06-11 12:59:23 (GMT)
committer: Denis Dzyubenko <denis.dzyubenko@nokia.com> 2009-06-11 19:04:15 (GMT)
commit: f6aa5d8cfbec4f4ffacf20a94a1653c1a8ee2134 (patch)
tree: 2ac49d8f20eb83067932bb655939d6bb044a0fb0
parent: 52392292c8fa096e3b0bb692dedce66924ab3305 (diff)
download: Qt-f6aa5d8cfbec4f4ffacf20a94a1653c1a8ee2134.zip
Qt-f6aa5d8cfbec4f4ffacf20a94a1653c1a8ee2134.tar.gz
Qt-f6aa5d8cfbec4f4ffacf20a94a1653c1a8ee2134.tar.bz2
3 files changed, 77 insertions, 1 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index 6611315..27c0572 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -184,7 +184,10 @@ void QUtf8Codec::convertToUnicode(QString *target, const char *chars, int len, C
                 uc = (uc << 6) | (ch & 0x3f);
                 need--;
                 if (!need) {
-                    if (uc > 0xffff && uc < 0x110000) {
+                    // utf-8 bom composes into 0xfeff code point
+                    if (!headerdone && uc == 0xfeff) {
+                        // don't do anything, just skip the BOM
+                    } else if (uc > 0xffff && uc < 0x110000) {
                         // surrogate pair
                         uc -= 0x10000;
                         unsigned short high = uc/0x400 + 0xd800;
@@ -206,6 +209,7 @@ void QUtf8Codec::convertToUnicode(QString *target, const char *chars, int len, C
                     } else {
                         *qch++ = uc;
                     }
+                    headerdone = true;
                 }
             } else {
                 // error
@@ -213,15 +217,18 @@ void QUtf8Codec::convertToUnicode(QString *target, const char *chars, int len, C
                 *qch++ = replacement;
                 ++invalid;
                 need = 0;
+                headerdone = true;
             }
         } else {
             if (ch < 128) {
                 *qch++ = QLatin1Char(ch);
+                headerdone = true;
             } else if ((ch & 0xe0) == 0xc0) {
                 uc = ch & 0x1f;
                 need = 1;
                 error = i;
                 min_uc = 0x80;
+                headerdone = true;
             } else if ((ch & 0xf0) == 0xe0) {
                 uc = ch & 0x0f;
                 need = 2;
@@ -232,10 +239,12 @@ void QUtf8Codec::convertToUnicode(QString *target, const char *chars, int len, C
                 need = 3;
                 error = i;
                 min_uc = 0x10000;
+                headerdone = true;
             } else {
                 // error
                 *qch++ = replacement;
                 ++invalid;
+                headerdone = true;
             }
         }
     }
diff --git a/tests/auto/qtextcodec/tst_qtextcodec.cpp b/tests/auto/qtextcodec/tst_qtextcodec.cpp
index 97c409b..88dbaf7 100644
--- a/tests/auto/qtextcodec/tst_qtextcodec.cpp
+++ b/tests/auto/qtextcodec/tst_qtextcodec.cpp
@@ -74,6 +74,9 @@ private slots:
     void utf8Codec_data();
     void utf8Codec();
 
+    void utf8bom_data();
+    void utf8bom();
+
     void utfHeaders_data();
     void utfHeaders();
 
@@ -1513,6 +1516,63 @@ void tst_QTextCodec::utf8Codec()
     QCOMPARE(str, res);
 }
 
+void tst_QTextCodec::utf8bom_data()
+{
+    QTest::addColumn<QByteArray>("data");
+    QTest::addColumn<QString>("result");
+
+    QTest::newRow("nobom")
+        << QByteArray("\302\240", 2)
+        << QString("\240");
+
+    {
+        static const ushort data[] = { 0x201d };
+        QTest::newRow("nobom 2")
+            << QByteArray("\342\200\235", 3)
+            << QString::fromUtf16(data, sizeof(data)/sizeof(short));
+    }
+
+    {
+        static const ushort data[] = { 0xf000 };
+        QTest::newRow("bom1")
+            << QByteArray("\357\200\200", 3)
+            << QString::fromUtf16(data, sizeof(data)/sizeof(short));
+    }
+
+    {
+        static const ushort data[] = { 0xfec0 };
+        QTest::newRow("bom2")
+            << QByteArray("\357\273\200", 3)
+            << QString::fromUtf16(data, sizeof(data)/sizeof(short));
+    }
+
+    {
+        QTest::newRow("normal-bom")
+            << QByteArray("\357\273\277a", 4)
+            << QString("a");
+    }
+
+    {
+        static const ushort data[] = { 0x61, 0xfeff, 0x62 };
+        QTest::newRow("middle-bom")
+            << QByteArray("a\357\273\277b", 5)
+            << QString::fromUtf16(data, sizeof(data)/sizeof(short));
+    }
+}
+
+void tst_QTextCodec::utf8bom()
+{
+    QFETCH(QByteArray, data);
+    QFETCH(QString, result);
+
+    QTextCodec *const codec = QTextCodec::codecForMib(106); // UTF-8
+    Q_ASSERT(codec);
+
+    QCOMPARE(codec->toUnicode(data.constData(), data.length(), 0), result);
+
+    QTextCodec::ConverterState state;
+    QCOMPARE(codec->toUnicode(data.constData(), data.length(), &state), result);
+}
 
 void tst_QTextCodec::utfHeaders_data()
 {
diff --git a/tests/auto/qtextstream/tst_qtextstream.cpp b/tests/auto/qtextstream/tst_qtextstream.cpp
index cf495d5..9573957 100644
--- a/tests/auto/qtextstream/tst_qtextstream.cpp
+++ b/tests/auto/qtextstream/tst_qtextstream.cpp
@@ -553,6 +553,13 @@ void tst_QTextStream::generateLineData(bool for_QString)
     QTest::newRow("threelines/crlf/crlf/nothing") << QByteArray("ole\r\ndole\r\ndoffen") << (QStringList() << "ole" << "dole" << "doffen");
 
     if (!for_QString) {
+        // utf-8
+        QTest::newRow("utf8/twolines")
+            << QByteArray("\xef\xbb\xbf"
+                          "\x66\x67\x65\x0a"
+                          "\x66\x67\x65\x0a", 11)
+            << (QStringList() << "fge" << "fge");
+
         // utf-16
         // one line
         QTest::newRow("utf16-BE/nothing")
author	Denis Dzyubenko <denis.dzyubenko@nokia.com>	2009-06-11 12:59:23 (GMT)
committer	Denis Dzyubenko <denis.dzyubenko@nokia.com>	2009-06-11 19:04:15 (GMT)
commit	f6aa5d8cfbec4f4ffacf20a94a1653c1a8ee2134 (patch)
tree	2ac49d8f20eb83067932bb655939d6bb044a0fb0
parent	52392292c8fa096e3b0bb692dedce66924ab3305 (diff)
download	Qt-f6aa5d8cfbec4f4ffacf20a94a1653c1a8ee2134.zip Qt-f6aa5d8cfbec4f4ffacf20a94a1653c1a8ee2134.tar.gz Qt-f6aa5d8cfbec4f4ffacf20a94a1653c1a8ee2134.tar.bz2