diff options
author | Thiago Macieira <thiago.macieira@nokia.com> | 2010-03-05 13:10:57 (GMT) |
---|---|---|
committer | Thiago Macieira <thiago.macieira@nokia.com> | 2010-03-05 15:57:27 (GMT) |
commit | 8b30b72948d5f16d59f8799f3889f563e2c6e24d (patch) | |
tree | 6279bd2f3faeb0be880225dc787858b98d3a935f /src/corelib/codecs | |
parent | 4f02ca5c9458299185d447da87058d6e8ca1b260 (diff) | |
download | Qt-8b30b72948d5f16d59f8799f3889f563e2c6e24d.zip Qt-8b30b72948d5f16d59f8799f3889f563e2c6e24d.tar.gz Qt-8b30b72948d5f16d59f8799f3889f563e2c6e24d.tar.bz2 |
Make the UTF-8 encoder/decoder not accept Unicode non-characters
Reviewed-By: Denis Dzyubenko
Diffstat (limited to 'src/corelib/codecs')
-rw-r--r-- | src/corelib/codecs/qutfcodec.cpp | 28 |
1 files changed, 25 insertions, 3 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index 742b2e7..233bd8f 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -48,6 +48,19 @@ QT_BEGIN_NAMESPACE
 
 enum { Endian = 0, Data = 1 };
 
+static inline bool isUnicodeNonCharacter(uint ucs4)
+{
+    // Unicode reserves a set of "non-characters" that may be used internally
+    // but must not appear in text interchange. They are the last two code
+    // points of every plane (U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, etc.) and the
+    // 32 code points from U+FDD0 to U+FDEF (inclusive). The unsigned
+    // subtraction wraps around for ucs4 < 0xfdd0, so a single comparison
+    // checks both ends of that range.
+
+    return (ucs4 & 0xfffe) == 0xfffe
+            || (ucs4 - 0xfdd0U) < 32;
+}
+
 QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state)
 {
     uchar replacement = '?';
@@ -106,6 +119,14 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
             if (u < 0x0800) {
                 *cursor++ = 0xc0 | ((uchar) (u >> 6));
             } else {
+                // is it one of the Unicode non-characters?
+                if (isUnicodeNonCharacter(u)) {
+                    *cursor = replacement;
+                    ++ch;
+                    ++invalid;
+                    continue;
+                }
+
                 if (u > 0xffff) {
                     *cursor++ = 0xf0 | ((uchar) (u >> 18));
                     *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
@@ -172,15 +193,16 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
                 --need;
                 if (!need) {
                     // utf-8 bom composes into 0xfeff code point
+                    bool nonCharacter;
                     if (!headerdone && uc == 0xfeff) {
                         // dont do anything, just skip the BOM
-                    } else if (uc > 0xffff && uc < 0x110000) {
+                    } else if (!(nonCharacter = isUnicodeNonCharacter(uc)) && uc > 0xffff && uc < 0x110000) {
                         // surrogate pair
                         Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
                         *qch++ = QChar::highSurrogate(uc);
                         *qch++ = QChar::lowSurrogate(uc);
-                    } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || (uc >= 0xfffe)) {
-                        // error: overlong sequence, UTF16 surrogate or BOM
+                    } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || nonCharacter || uc >= 0x110000) {
+                        // error: overlong sequence, UTF16 surrogate or non-character
                         *qch++ = replacement;
                         ++invalid;
                     } else { |