summaryrefslogtreecommitdiffstats
path: root/src/corelib/codecs/qtextcodec.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/corelib/codecs/qtextcodec.cpp')
-rw-r--r--src/corelib/codecs/qtextcodec.cpp77
1 files changed, 65 insertions, 12 deletions
diff --git a/src/corelib/codecs/qtextcodec.cpp b/src/corelib/codecs/qtextcodec.cpp
index 6e8ffa1..51ca43e 100644
--- a/src/corelib/codecs/qtextcodec.cpp
+++ b/src/corelib/codecs/qtextcodec.cpp
@@ -1508,9 +1508,14 @@ QString QTextDecoder::toUnicode(const QByteArray &ba)
/*!
\since 4.4
- Tries to detect the encoding of the provided snippet of HTML in the given byte array, \a ba,
- and returns a QTextCodec instance that is capable of decoding the html to unicode.
- If the codec cannot be detected from the content provided, \a defaultCodec is returned.
+ Tries to detect the encoding of the provided snippet of HTML in
+ the given byte array, \a ba, by checking the BOM (Byte Order Mark)
+ and the content-type meta header and returns a QTextCodec instance
+ that is capable of decoding the html to unicode. If the codec
+ cannot be detected from the content provided, \a defaultCodec is
+ returned.
+
+ \sa codecForUtfText
*/
QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec)
{
@@ -1518,15 +1523,8 @@ QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCo
int pos;
QTextCodec *c = 0;
- if (ba.size() > 1 && (((uchar)ba[0] == 0xfe && (uchar)ba[1] == 0xff)
- || ((uchar)ba[0] == 0xff && (uchar)ba[1] == 0xfe))) {
- c = QTextCodec::codecForMib(1015); // utf16
- } else if (ba.size() > 2
- && (uchar)ba[0] == 0xef
- && (uchar)ba[1] == 0xbb
- && (uchar)ba[2] == 0xbf) {
- c = QTextCodec::codecForMib(106); // utf-8
- } else {
+ c = QTextCodec::codecForUtfText(ba, c);
+ if (!c) {
QByteArray header = ba.left(512).toLower();
if ((pos = header.indexOf("http-equiv=")) != -1) {
pos = header.indexOf("charset=", pos) + int(strlen("charset="));
@@ -1554,6 +1552,61 @@ QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba)
return codecForHtml(ba, QTextCodec::codecForMib(/*Latin 1*/ 4));
}
+/*!
+ \since 4.6
+
+ Tries to detect the encoding of the provided snippet \a ba by
+ using the BOM (Byte Order Mark) and returns a QTextCodec instance
+ that is capable of decoding the text to unicode. If the codec
+ cannot be detected from the content provided, \a defaultCodec is
+ returned.
+
+ \sa codecForHtml
+*/
+QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec)
+{
+ const uint arraySize = ba.size();
+
+ if (arraySize > 3) {
+ if ((uchar)ba[0] == 0x00
+ && (uchar)ba[1] == 0x00
+ && (uchar)ba[2] == 0xFE
+ && (uchar)ba[3] == 0xFF)
+ return QTextCodec::codecForMib(1018); // utf-32 be
+ else if ((uchar)ba[0] == 0xFF
+ && (uchar)ba[1] == 0xFE
+ && (uchar)ba[2] == 0x00
+ && (uchar)ba[3] == 0x00)
+ return QTextCodec::codecForMib(1019); // utf-32 le
+ }
+
+ if (arraySize < 2)
+ return defaultCodec;
+ if ((uchar)ba[0] == 0xfe && (uchar)ba[1] == 0xff)
+ return QTextCodec::codecForMib(1013); // utf16 be
+ else if ((uchar)ba[0] == 0xff && (uchar)ba[1] == 0xfe)
+ return QTextCodec::codecForMib(1014); // utf16 le
+
+ if (arraySize < 3)
+ return defaultCodec;
+ if ((uchar)ba[0] == 0xef
+ && (uchar)ba[1] == 0xbb
+ && (uchar)ba[2] == 0xbf)
+ return QTextCodec::codecForMib(106); // utf-8
+
+ return defaultCodec;
+}
+
+/*!
+ \overload
+
+ If the codec cannot be detected, this overload returns a Latin-1 QTextCodec.
+*/
+QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba)
+{
+ return codecForUtfText(ba, QTextCodec::codecForMib(/*Latin 1*/ 4));
+}
+
/*! \internal
\since 4.3