summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDenis Dzyubenko <denis.dzyubenko@nokia.com>2009-04-28 12:08:59 (GMT)
committerDenis Dzyubenko <denis.dzyubenko@nokia.com>2009-05-04 13:34:03 (GMT)
commit7031e1d110bb1bc97cfe0377adc211030e1e7320 (patch)
treeb1260d28e0865eaab3c91f1330a1df0dca2ba858
parent1e0e67406c3865717fef8b98d2c69adbefc54245 (diff)
downloadQt-7031e1d110bb1bc97cfe0377adc211030e1e7320.zip
Qt-7031e1d110bb1bc97cfe0377adc211030e1e7320.tar.gz
Qt-7031e1d110bb1bc97cfe0377adc211030e1e7320.tar.bz2
When data was copied from Mozilla Firefox to Qt, the text format was not valid.
Mozilla encodes the text/html format in UTF16 and adds a BOM, however it doesn't specify the charset in the html header. The fix is to guess the encoding by either charset in the html header or BOM for text/html format, or by BOM for non html formats. This commit adds a new public function QTextCodec::codecForUtfText() which can be used to guess encoding out of the BOM. Task-number: 250555 Reviewed-by: Benjamin Poulain Reviewed-by: Simon Hausmann Reviewed-by: Andreas Aardal Hanssen
-rw-r--r--src/corelib/codecs/qtextcodec.cpp77
-rw-r--r--src/corelib/codecs/qtextcodec.h3
-rw-r--r--src/gui/kernel/qclipboard.cpp24
-rw-r--r--tests/auto/qtextcodec/tst_qtextcodec.cpp59
4 files changed, 144 insertions, 19 deletions
diff --git a/src/corelib/codecs/qtextcodec.cpp b/src/corelib/codecs/qtextcodec.cpp
index 6e8ffa1..51ca43e 100644
--- a/src/corelib/codecs/qtextcodec.cpp
+++ b/src/corelib/codecs/qtextcodec.cpp
@@ -1508,9 +1508,14 @@ QString QTextDecoder::toUnicode(const QByteArray &ba)
/*!
\since 4.4
- Tries to detect the encoding of the provided snippet of HTML in the given byte array, \a ba,
- and returns a QTextCodec instance that is capable of decoding the html to unicode.
- If the codec cannot be detected from the content provided, \a defaultCodec is returned.
+ Tries to detect the encoding of the provided snippet of HTML in
+ the given byte array, \a ba, by checking the BOM (Byte Order Mark)
+ and the content-type meta header and returns a QTextCodec instance
+ that is capable of decoding the html to unicode. If the codec
+ cannot be detected from the content provided, \a defaultCodec is
+ returned.
+
+ \sa codecForUtfText
*/
QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec)
{
@@ -1518,15 +1523,8 @@ QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCo
int pos;
QTextCodec *c = 0;
- if (ba.size() > 1 && (((uchar)ba[0] == 0xfe && (uchar)ba[1] == 0xff)
- || ((uchar)ba[0] == 0xff && (uchar)ba[1] == 0xfe))) {
- c = QTextCodec::codecForMib(1015); // utf16
- } else if (ba.size() > 2
- && (uchar)ba[0] == 0xef
- && (uchar)ba[1] == 0xbb
- && (uchar)ba[2] == 0xbf) {
- c = QTextCodec::codecForMib(106); // utf-8
- } else {
+ c = QTextCodec::codecForUtfText(ba, c);
+ if (!c) {
QByteArray header = ba.left(512).toLower();
if ((pos = header.indexOf("http-equiv=")) != -1) {
pos = header.indexOf("charset=", pos) + int(strlen("charset="));
@@ -1554,6 +1552,61 @@ QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba)
return codecForHtml(ba, QTextCodec::codecForMib(/*Latin 1*/ 4));
}
+/*!
+ \since 4.6
+
+ Tries to detect the encoding of the provided snippet \a ba by
+ using the BOM (Byte Order Mark) and returns a QTextCodec instance
+ that is capable of decoding the text to unicode. If the codec
+ cannot be detected from the content provided, \a defaultCodec is
+ returned.
+
+ \sa codecForHtml
+*/
+QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec)
+{
+ const uint arraySize = ba.size();
+
+ if (arraySize > 3) {
+ if ((uchar)ba[0] == 0x00
+ && (uchar)ba[1] == 0x00
+ && (uchar)ba[2] == 0xFE
+ && (uchar)ba[3] == 0xFF)
+ return QTextCodec::codecForMib(1018); // utf-32 be
+ else if ((uchar)ba[0] == 0xFF
+ && (uchar)ba[1] == 0xFE
+ && (uchar)ba[2] == 0x00
+ && (uchar)ba[3] == 0x00)
+ return QTextCodec::codecForMib(1019); // utf-32 le
+ }
+
+ if (arraySize < 2)
+ return defaultCodec;
+ if ((uchar)ba[0] == 0xfe && (uchar)ba[1] == 0xff)
+ return QTextCodec::codecForMib(1013); // utf16 be
+ else if ((uchar)ba[0] == 0xff && (uchar)ba[1] == 0xfe)
+ return QTextCodec::codecForMib(1014); // utf16 le
+
+ if (arraySize < 3)
+ return defaultCodec;
+ if ((uchar)ba[0] == 0xef
+ && (uchar)ba[1] == 0xbb
+ && (uchar)ba[2] == 0xbf)
+ return QTextCodec::codecForMib(106); // utf-8
+
+ return defaultCodec;
+}
+
+/*!
+ \overload
+
+ If the codec cannot be detected, this overload returns a Latin-1 QTextCodec.
+*/
+QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba)
+{
+ return codecForUtfText(ba, QTextCodec::codecForMib(/*Latin 1*/ 4));
+}
+
/*! \internal
\since 4.3
diff --git a/src/corelib/codecs/qtextcodec.h b/src/corelib/codecs/qtextcodec.h
index e32650f..83097a5 100644
--- a/src/corelib/codecs/qtextcodec.h
+++ b/src/corelib/codecs/qtextcodec.h
@@ -82,6 +82,9 @@ public:
static QTextCodec *codecForHtml(const QByteArray &ba);
static QTextCodec *codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec);
+ static QTextCodec *codecForUtfText(const QByteArray &ba);
+ static QTextCodec *codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec);
+
QTextDecoder* makeDecoder() const;
QTextEncoder* makeEncoder() const;
diff --git a/src/gui/kernel/qclipboard.cpp b/src/gui/kernel/qclipboard.cpp
index 917b5d5..6daf433 100644
--- a/src/gui/kernel/qclipboard.cpp
+++ b/src/gui/kernel/qclipboard.cpp
@@ -50,6 +50,7 @@
#include "qvariant.h"
#include "qbuffer.h"
#include "qimage.h"
+#include "qtextcodec.h"
QT_BEGIN_NAMESPACE
@@ -276,11 +277,12 @@ QClipboard::~QClipboard()
*/
QString QClipboard::text(QString &subtype, Mode mode) const
{
- const QMimeData *data = mimeData(mode);
+ const QMimeData *const data = mimeData(mode);
if (!data)
return QString();
+
+ const QStringList formats = data->formats();
if (subtype.isEmpty()) {
- QStringList formats = data->formats();
if (formats.contains(QLatin1String("text/plain")))
subtype = QLatin1String("plain");
else {
@@ -289,13 +291,21 @@ QString QClipboard::text(QString &subtype, Mode mode) const
subtype = formats.at(i).mid(5);
break;
}
+ if (subtype.isEmpty())
+ return QString();
}
- }
- if (subtype.isEmpty())
+ } else if (!formats.contains(QLatin1String("text/") + subtype)) {
return QString();
- if (subtype == QLatin1String("plain"))
- return data->text();
- return QString::fromUtf8(data->data(QLatin1String("text/") + subtype));
+ }
+
+ const QByteArray rawData = data->data(QLatin1String("text/") + subtype);
+
+ QTextCodec* codec = QTextCodec::codecForMib(106); // utf-8 is default
+ if (subtype == QLatin1String("html"))
+ codec = QTextCodec::codecForHtml(rawData, codec);
+ else
+ codec = QTextCodec::codecForUtfText(rawData, codec);
+ return codec->toUnicode(rawData);
}
/*!
diff --git a/tests/auto/qtextcodec/tst_qtextcodec.cpp b/tests/auto/qtextcodec/tst_qtextcodec.cpp
index cf4135b..22f9557 100644
--- a/tests/auto/qtextcodec/tst_qtextcodec.cpp
+++ b/tests/auto/qtextcodec/tst_qtextcodec.cpp
@@ -79,6 +79,9 @@ private slots:
void codecForHtml();
+ void codecForUtfText_data();
+ void codecForUtfText();
+
#ifdef Q_OS_UNIX
void toLocal8Bit();
#endif
@@ -1744,6 +1747,62 @@ void tst_QTextCodec::codecForHtml()
QCOMPARE(QTextCodec::codecForHtml(html, QTextCodec::codecForMib(106))->mibEnum(), 111); // latin 15
}
+void tst_QTextCodec::codecForUtfText_data()
+{
+ QTest::addColumn<QByteArray>("encoded");
+ QTest::addColumn<bool>("detected");
+ QTest::addColumn<int>("mib");
+
+
+ QTest::newRow("utf8 bom")
+ << QByteArray("\xef\xbb\xbfhello")
+ << true
+ << 106;
+ QTest::newRow("utf8 nobom")
+ << QByteArray("hello")
+ << false
+ << 0;
+
+ QTest::newRow("utf16 bom be")
+ << QByteArray("\xfe\xff\0h\0e\0l", 8)
+ << true
+ << 1013;
+ QTest::newRow("utf16 bom le")
+ << QByteArray("\xff\xfeh\0e\0l\0", 8)
+ << true
+ << 1014;
+ QTest::newRow("utf16 nobom")
+ << QByteArray("\0h\0e\0l", 6)
+ << false
+ << 0;
+
+ QTest::newRow("utf32 bom be")
+ << QByteArray("\0\0\xfe\xff\0\0\0h\0\0\0e\0\0\0l", 16)
+ << true
+ << 1018;
+ QTest::newRow("utf32 bom le")
+ << QByteArray("\xff\xfe\0\0h\0\0\0e\0\0\0l\0\0\0", 16)
+ << true
+ << 1019;
+ QTest::newRow("utf32 nobom")
+ << QByteArray("\0\0\0h\0\0\0e\0\0\0l", 12)
+ << false
+ << 0;
+}
+
+void tst_QTextCodec::codecForUtfText()
+{
+ QFETCH(QByteArray, encoded);
+ QFETCH(bool, detected);
+ QFETCH(int, mib);
+
+ QTextCodec *codec = QTextCodec::codecForUtfText(encoded, 0);
+ if (detected)
+ QCOMPARE(codec->mibEnum(), mib);
+ else
+ QVERIFY(codec == 0);
+}
+
#ifdef Q_OS_UNIX
void tst_QTextCodec::toLocal8Bit()
{