From 7031e1d110bb1bc97cfe0377adc211030e1e7320 Mon Sep 17 00:00:00 2001 From: Denis Dzyubenko Date: Tue, 28 Apr 2009 14:08:59 +0200 Subject: When data was copied from Mozilla Firefox to Qt, the text format was not valid. Mozilla encodes the text/html format in UTF16 and adds a BOM, however it doesn't specify the charset in the html header. The fix is to guess the encoding by either charset in the html header or BOM for text/html format, or by BOM for non html formats. This commit adds a new public function QTextCodec::codecForUtfText() which can be used to guess encoding out of the BOM. Task-number: 250555 Reviewed-by: Benjamin Poulain Reviewed-by: Simon Hausmann Reviewed-by: Andreas Aardal Hanssen --- src/corelib/codecs/qtextcodec.cpp | 77 +++++++++++++++++++++++++++----- src/corelib/codecs/qtextcodec.h | 3 ++ src/gui/kernel/qclipboard.cpp | 24 +++++++--- tests/auto/qtextcodec/tst_qtextcodec.cpp | 59 ++++++++++++++++++++++++ 4 files changed, 144 insertions(+), 19 deletions(-) diff --git a/src/corelib/codecs/qtextcodec.cpp b/src/corelib/codecs/qtextcodec.cpp index 6e8ffa1..51ca43e 100644 --- a/src/corelib/codecs/qtextcodec.cpp +++ b/src/corelib/codecs/qtextcodec.cpp @@ -1508,9 +1508,14 @@ QString QTextDecoder::toUnicode(const QByteArray &ba) /*! \since 4.4 - Tries to detect the encoding of the provided snippet of HTML in the given byte array, \a ba, - and returns a QTextCodec instance that is capable of decoding the html to unicode. - If the codec cannot be detected from the content provided, \a defaultCodec is returned. + Tries to detect the encoding of the provided snippet of HTML in + the given byte array, \a ba, by checking the BOM (Byte Order Mark) + and the content-type meta header and returns a QTextCodec instance + that is capable of decoding the html to unicode. If the codec + cannot be detected from the content provided, \a defaultCodec is + returned. + + \sa codecForUtfText */ QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec) { @@ -1518,15 +1523,8 @@ QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCo int pos; QTextCodec *c = 0; - if (ba.size() > 1 && (((uchar)ba[0] == 0xfe && (uchar)ba[1] == 0xff) - || ((uchar)ba[0] == 0xff && (uchar)ba[1] == 0xfe))) { - c = QTextCodec::codecForMib(1015); // utf16 - } else if (ba.size() > 2 - && (uchar)ba[0] == 0xef - && (uchar)ba[1] == 0xbb - && (uchar)ba[2] == 0xbf) { - c = QTextCodec::codecForMib(106); // utf-8 - } else { + c = QTextCodec::codecForUtfText(ba, c); + if (!c) { QByteArray header = ba.left(512).toLower(); if ((pos = header.indexOf("http-equiv=")) != -1) { pos = header.indexOf("charset=", pos) + int(strlen("charset=")); @@ -1554,6 +1552,61 @@ QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba) return codecForHtml(ba, QTextCodec::codecForMib(/*Latin 1*/ 4)); } +/*! + \since 4.6 + + Tries to detect the encoding of the provided snippet \a ba by + using the BOM (Byte Order Mark) and returns a QTextCodec instance + that is capable of decoding the text to unicode. If the codec + cannot be detected from the content provided, \a defaultCodec is + returned. + + \sa codecForHtml +*/ +QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec) +{ + const uint arraySize = ba.size(); + + if (arraySize > 3) { + if ((uchar)ba[0] == 0x00 + && (uchar)ba[1] == 0x00 + && (uchar)ba[2] == 0xFE + && (uchar)ba[3] == 0xFF) + return QTextCodec::codecForMib(1018); // utf-32 be + else if ((uchar)ba[0] == 0xFF + && (uchar)ba[1] == 0xFE + && (uchar)ba[2] == 0x00 + && (uchar)ba[3] == 0x00) + return QTextCodec::codecForMib(1019); // utf-32 le + } + + if (arraySize < 2) + return defaultCodec; + if ((uchar)ba[0] == 0xfe && (uchar)ba[1] == 0xff) + return QTextCodec::codecForMib(1013); // utf16 be + else if ((uchar)ba[0] == 0xff && (uchar)ba[1] == 0xfe) + return QTextCodec::codecForMib(1014); // utf16 le + + if (arraySize < 3) + return defaultCodec; + if ((uchar)ba[0] == 0xef + && (uchar)ba[1] == 0xbb + && (uchar)ba[2] == 0xbf) + return QTextCodec::codecForMib(106); // utf-8 + + return defaultCodec; +} + +/*! + \overload + + If the codec cannot be detected, this overload returns a Latin-1 QTextCodec. +*/ +QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba) +{ + return codecForUtfText(ba, QTextCodec::codecForMib(/*Latin 1*/ 4)); +} + /*! \internal \since 4.3 diff --git a/src/corelib/codecs/qtextcodec.h b/src/corelib/codecs/qtextcodec.h index e32650f..83097a5 100644 --- a/src/corelib/codecs/qtextcodec.h +++ b/src/corelib/codecs/qtextcodec.h @@ -82,6 +82,9 @@ public: static QTextCodec *codecForHtml(const QByteArray &ba); static QTextCodec *codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec); + static QTextCodec *codecForUtfText(const QByteArray &ba); + static QTextCodec *codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec); + QTextDecoder* makeDecoder() const; QTextEncoder* makeEncoder() const; diff --git a/src/gui/kernel/qclipboard.cpp b/src/gui/kernel/qclipboard.cpp index 917b5d5..6daf433 100644 --- a/src/gui/kernel/qclipboard.cpp +++ b/src/gui/kernel/qclipboard.cpp @@ -50,6 +50,7 @@ #include "qvariant.h" #include "qbuffer.h" #include "qimage.h" +#include "qtextcodec.h" QT_BEGIN_NAMESPACE @@ -276,11 +277,12 @@ QClipboard::~QClipboard() */ QString QClipboard::text(QString &subtype, Mode mode) const { - const QMimeData *data = mimeData(mode); + const QMimeData *const data = mimeData(mode); if (!data) return QString(); + + const QStringList formats = data->formats(); if (subtype.isEmpty()) { - QStringList formats = data->formats(); if (formats.contains(QLatin1String("text/plain"))) subtype = QLatin1String("plain"); else { @@ -289,13 +291,21 @@ QString QClipboard::text(QString &subtype, Mode mode) const subtype = formats.at(i).mid(5); break; } + if (subtype.isEmpty()) + return QString(); } - } - if (subtype.isEmpty()) + } else if (!formats.contains(QLatin1String("text/") + subtype)) { return QString(); - if (subtype == QLatin1String("plain")) - return data->text(); - return QString::fromUtf8(data->data(QLatin1String("text/") + subtype)); + } + + const QByteArray rawData = data->data(QLatin1String("text/") + subtype); + + QTextCodec* codec = QTextCodec::codecForMib(106); // utf-8 is default + if (subtype == QLatin1String("html")) + codec = QTextCodec::codecForHtml(rawData, codec); + else + codec = QTextCodec::codecForUtfText(rawData, codec); + return codec->toUnicode(rawData); } /*! diff --git a/tests/auto/qtextcodec/tst_qtextcodec.cpp b/tests/auto/qtextcodec/tst_qtextcodec.cpp index cf4135b..22f9557 100644 --- a/tests/auto/qtextcodec/tst_qtextcodec.cpp +++ b/tests/auto/qtextcodec/tst_qtextcodec.cpp @@ -79,6 +79,9 @@ private slots: void codecForHtml(); + void codecForUtfText_data(); + void codecForUtfText(); + #ifdef Q_OS_UNIX void toLocal8Bit(); #endif @@ -1744,6 +1747,62 @@ void tst_QTextCodec::codecForHtml() QCOMPARE(QTextCodec::codecForHtml(html, QTextCodec::codecForMib(106))->mibEnum(), 111); // latin 15 } +void tst_QTextCodec::codecForUtfText_data() +{ + QTest::addColumn("encoded"); + QTest::addColumn("detected"); + QTest::addColumn("mib"); + + + QTest::newRow("utf8 bom") + << QByteArray("\xef\xbb\xbfhello") + << true + << 106; + QTest::newRow("utf8 nobom") + << QByteArray("hello") + << false + << 0; + + QTest::newRow("utf16 bom be") + << QByteArray("\xfe\xff\0h\0e\0l", 8) + << true + << 1013; + QTest::newRow("utf16 bom le") + << QByteArray("\xff\xfeh\0e\0l\0", 8) + << true + << 1014; + QTest::newRow("utf16 nobom") + << QByteArray("\0h\0e\0l", 6) + << false + << 0; + + QTest::newRow("utf32 bom be") + << QByteArray("\0\0\xfe\xff\0\0\0h\0\0\0e\0\0\0l", 16) + << true + << 1018; + QTest::newRow("utf32 bom le") + << QByteArray("\xff\xfe\0\0h\0\0\0e\0\0\0l\0\0\0", 16) + << true + << 1019; + QTest::newRow("utf32 nobom") + << QByteArray("\0\0\0h\0\0\0e\0\0\0l", 12) + << false + << 0; +} + +void tst_QTextCodec::codecForUtfText() +{ + QFETCH(QByteArray, encoded); + QFETCH(bool, detected); + QFETCH(int, mib); + + QTextCodec *codec = QTextCodec::codecForUtfText(encoded, 0); + if (detected) + QCOMPARE(codec->mibEnum(), mib); + else + QVERIFY(codec == 0); +} + #ifdef Q_OS_UNIX void tst_QTextCodec::toLocal8Bit() { -- cgit v0.12