diff options
author | Denis Dzyubenko <denis.dzyubenko@nokia.com> | 2009-04-28 12:08:59 (GMT) |
---|---|---|
committer | Denis Dzyubenko <denis.dzyubenko@nokia.com> | 2009-05-04 13:34:03 (GMT) |
commit | 7031e1d110bb1bc97cfe0377adc211030e1e7320 (patch) | |
tree | b1260d28e0865eaab3c91f1330a1df0dca2ba858 /src/corelib | |
parent | 1e0e67406c3865717fef8b98d2c69adbefc54245 (diff) | |
download | Qt-7031e1d110bb1bc97cfe0377adc211030e1e7320.zip Qt-7031e1d110bb1bc97cfe0377adc211030e1e7320.tar.gz Qt-7031e1d110bb1bc97cfe0377adc211030e1e7320.tar.bz2 |
When data was copied from Mozilla Firefox to Qt, the text format was not valid.
Mozilla encodes the text/html format in UTF-16 and adds a BOM; however,
it doesn't specify the charset in the HTML header. The fix is to guess
the encoding from either the charset in the HTML header or the BOM for the
text/html format, or from the BOM alone for non-HTML formats.
This commit adds a new public function, QTextCodec::codecForUtfText(), which
can be used to guess the encoding from the BOM.
Task-number: 250555
Reviewed-by: Benjamin Poulain
Reviewed-by: Simon Hausmann
Reviewed-by: Andreas Aardal Hanssen
Diffstat (limited to 'src/corelib')
-rw-r--r-- | src/corelib/codecs/qtextcodec.cpp | 77 | ||||
-rw-r--r-- | src/corelib/codecs/qtextcodec.h | 3 |
2 files changed, 68 insertions, 12 deletions
diff --git a/src/corelib/codecs/qtextcodec.cpp b/src/corelib/codecs/qtextcodec.cpp index 6e8ffa1..51ca43e 100644 --- a/src/corelib/codecs/qtextcodec.cpp +++ b/src/corelib/codecs/qtextcodec.cpp @@ -1508,9 +1508,14 @@ QString QTextDecoder::toUnicode(const QByteArray &ba) /*! \since 4.4 - Tries to detect the encoding of the provided snippet of HTML in the given byte array, \a ba, - and returns a QTextCodec instance that is capable of decoding the html to unicode. - If the codec cannot be detected from the content provided, \a defaultCodec is returned. + Tries to detect the encoding of the provided snippet of HTML in + the given byte array, \a ba, by checking the BOM (Byte Order Mark) + and the content-type meta header and returns a QTextCodec instance + that is capable of decoding the html to unicode. If the codec + cannot be detected from the content provided, \a defaultCodec is + returned. + + \sa codecForUtfText */ QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec) { @@ -1518,15 +1523,8 @@ QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCo int pos; QTextCodec *c = 0; - if (ba.size() > 1 && (((uchar)ba[0] == 0xfe && (uchar)ba[1] == 0xff) - || ((uchar)ba[0] == 0xff && (uchar)ba[1] == 0xfe))) { - c = QTextCodec::codecForMib(1015); // utf16 - } else if (ba.size() > 2 - && (uchar)ba[0] == 0xef - && (uchar)ba[1] == 0xbb - && (uchar)ba[2] == 0xbf) { - c = QTextCodec::codecForMib(106); // utf-8 - } else { + c = QTextCodec::codecForUtfText(ba, c); + if (!c) { QByteArray header = ba.left(512).toLower(); if ((pos = header.indexOf("http-equiv=")) != -1) { pos = header.indexOf("charset=", pos) + int(strlen("charset=")); @@ -1554,6 +1552,61 @@ QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba) return codecForHtml(ba, QTextCodec::codecForMib(/*Latin 1*/ 4)); } +/*! 
+ \since 4.6 + + Tries to detect the encoding of the provided snippet \a ba by + using the BOM (Byte Order Mark) and returns a QTextCodec instance + that is capable of decoding the text to unicode. If the codec + cannot be detected from the content provided, \a defaultCodec is + returned. + + \sa codecForHtml +*/ +QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec) +{ + const uint arraySize = ba.size(); + + if (arraySize > 3) { + if ((uchar)ba[0] == 0x00 + && (uchar)ba[1] == 0x00 + && (uchar)ba[2] == 0xFE + && (uchar)ba[3] == 0xFF) + return QTextCodec::codecForMib(1018); // utf-32 be + else if ((uchar)ba[0] == 0xFF + && (uchar)ba[1] == 0xFE + && (uchar)ba[2] == 0x00 + && (uchar)ba[3] == 0x00) + return QTextCodec::codecForMib(1019); // utf-32 le + } + + if (arraySize < 2) + return defaultCodec; + if ((uchar)ba[0] == 0xfe && (uchar)ba[1] == 0xff) + return QTextCodec::codecForMib(1013); // utf16 be + else if ((uchar)ba[0] == 0xff && (uchar)ba[1] == 0xfe) + return QTextCodec::codecForMib(1014); // utf16 le + + if (arraySize < 3) + return defaultCodec; + if ((uchar)ba[0] == 0xef + && (uchar)ba[1] == 0xbb + && (uchar)ba[2] == 0xbf) + return QTextCodec::codecForMib(106); // utf-8 + + return defaultCodec; +} + +/*! + \overload + + If the codec cannot be detected, this overload returns a Latin-1 QTextCodec. +*/ +QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba) +{ + return codecForUtfText(ba, QTextCodec::codecForMib(/*Latin 1*/ 4)); +} + /*! 
\internal \since 4.3 diff --git a/src/corelib/codecs/qtextcodec.h b/src/corelib/codecs/qtextcodec.h index e32650f..83097a5 100644 --- a/src/corelib/codecs/qtextcodec.h +++ b/src/corelib/codecs/qtextcodec.h @@ -82,6 +82,9 @@ public: static QTextCodec *codecForHtml(const QByteArray &ba); static QTextCodec *codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec); + static QTextCodec *codecForUtfText(const QByteArray &ba); + static QTextCodec *codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec); + QTextDecoder* makeDecoder() const; QTextEncoder* makeEncoder() const; |