From 2916b0749902ee51a3deab982e53189e72c3b693 Mon Sep 17 00:00:00 2001 From: Denis Dzyubenko Date: Mon, 23 Aug 2010 09:07:11 +0200 Subject: Added encoding conversion functions to QStringRef. Reviewed-by: Olivier Goffart --- src/corelib/tools/qstring.cpp | 160 ++++++++++++++++++++++++++++++------- src/corelib/tools/qstring.h | 6 ++ tests/auto/qstring/tst_qstring.cpp | 66 +++++++++++++++ 3 files changed, 203 insertions(+), 29 deletions(-) diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index 7ea9877..de45a63 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -927,6 +927,24 @@ QString QString::fromWCharArray(const wchar_t *string, int size) \sa utf16(), toAscii(), toLatin1(), toUtf8(), toLocal8Bit() */ +template int toUcs4_helper(const unsigned short *uc, int length, T *out) +{ + int i = 0; + for (; i < length; ++i) { + uint u = uc[i]; + if (QChar::isHighSurrogate(u) && i < length-1) { + ushort low = uc[i+1]; + if (QChar::isLowSurrogate(low)) { + ++i; + u = QChar::surrogateToUcs4(u, low); + } + } + *out = T(u); + ++out; + } + return i; +} + /*! \since 4.2 @@ -951,21 +969,7 @@ int QString::toWCharArray(wchar_t *array) const memcpy(array, utf16(), sizeof(wchar_t)*length()); return length(); } else { - wchar_t *a = array; - const unsigned short *uc = utf16(); - for (int i = 0; i < length(); ++i) { - uint u = uc[i]; - if (QChar::isHighSurrogate(u) && i + 1 < length()) { - ushort low = uc[i+1]; - if (QChar::isLowSurrogate(low)) { - u = QChar::surrogateToUcs4(u, low); - ++i; - } - } - *a = wchar_t(u); - ++a; - } - return a - array; + return toUcs4_helper(utf16(), length(), array); } } @@ -3709,20 +3713,8 @@ QVector QString::toUcs4() const { QVector v(length()); uint *a = v.data(); - const unsigned short *uc = utf16(); - for (int i = 0; i < length(); ++i) { - uint u = uc[i]; - if (QChar(u).isHighSurrogate() && i < length()-1) { - ushort low = uc[i+1]; - if (QChar(low).isLowSurrogate()) { - ++i; - u = QChar::surrogateToUcs4(u, low); - } - } - *a = u; - ++a; - } - v.resize(a - v.data()); + int len = toUcs4_helper(utf16(), length(), a); + v.resize(len); return v; } @@ -8961,4 +8953,114 @@ static inline bool qt_ends_with(const QChar *haystack, int haystackLen, return true; } +/*! + \since 4.8 + + Returns a Latin-1 representation of the string as a QByteArray. + + The returned byte array is undefined if the string contains non-Latin1 + characters. Those characters may be suppressed or replaced with a + question mark. + + \sa toAscii(), toUtf8(), toLocal8Bit(), QTextCodec +*/ +QByteArray QStringRef::toLatin1() const +{ + return toLatin1_helper(unicode(), length()); +} + +/*! + \since 4.8 + + Returns an 8-bit representation of the string as a QByteArray. + + If a codec has been set using QTextCodec::setCodecForCStrings(), + it is used to convert Unicode to 8-bit char; otherwise this + function does the same as toLatin1(). + + Note that, despite the name, this function does not necessarily return an US-ASCII + (ANSI X3.4-1986) string and its result may not be US-ASCII compatible. + + \sa toLatin1(), toUtf8(), toLocal8Bit(), QTextCodec +*/ +QByteArray QStringRef::toAscii() const +{ +#ifndef QT_NO_TEXTCODEC + if (QString::codecForCStrings) + return QString::codecForCStrings->fromUnicode(unicode(), length()); +#endif // QT_NO_TEXTCODEC + return toLatin1(); +} + +/*! + \since 4.8 + + Returns the local 8-bit representation of the string as a + QByteArray. The returned byte array is undefined if the string + contains characters not supported by the local 8-bit encoding. + + QTextCodec::codecForLocale() is used to perform the conversion from + Unicode. If the locale encoding could not be determined, this function + does the same as toLatin1(). + + If this string contains any characters that cannot be encoded in the + locale, the returned byte array is undefined. Those characters may be + suppressed or replaced by another. + + \sa toAscii(), toLatin1(), toUtf8(), QTextCodec +*/ +QByteArray QStringRef::toLocal8Bit() const +{ +#ifndef QT_NO_TEXTCODEC + if (QTextCodec::codecForLocale()) + return QTextCodec::codecForLocale()->fromUnicode(unicode(), length()); +#endif // QT_NO_TEXTCODEC + return toLatin1(); +} + +/*! + \since 4.8 + + Returns a UTF-8 representation of the string as a QByteArray. + + UTF-8 is a Unicode codec and can represent all characters in a Unicode + string like QString. + + However, in the Unicode range, there are certain codepoints that are not + considered characters. The Unicode standard reserves the last two + codepoints in each Unicode Plane (U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, + U+2FFFE, etc.), as well as 16 codepoints in the range U+FDD0..U+FDDF, + inclusive, as non-characters. If any of those appear in the string, they + may be discarded and will not appear in the UTF-8 representation, or they + may be replaced by one or more replacement characters. + + \sa toAscii(), toLatin1(), toLocal8Bit(), QTextCodec +*/ +QByteArray QStringRef::toUtf8() const +{ + if (isNull()) + return QByteArray(); + + return QUtf8::convertFromUnicode(constData(), length(), 0); +} + +/*! + \since 4.8 + + Returns a UCS-4/UTF-32 representation of the string as a QVector. + + UCS-4 is a Unicode codec and is lossless. All characters from this string + can be encoded in UCS-4. + + \sa fromUtf8(), toAscii(), toLatin1(), toLocal8Bit(), QTextCodec, fromUcs4(), toWCharArray() +*/ +QVector QStringRef::toUcs4() const +{ + QVector v(length()); + uint *a = v.data(); + int len = toUcs4_helper(reinterpret_cast(unicode()), length(), a); + v.resize(len); + return v; +} + QT_END_NAMESPACE diff --git a/src/corelib/tools/qstring.h b/src/corelib/tools/qstring.h index 7f7e475..2252353 100644 --- a/src/corelib/tools/qstring.h +++ b/src/corelib/tools/qstring.h @@ -1161,6 +1161,12 @@ public: inline const QChar *data() const { return unicode(); } inline const QChar *constData() const { return unicode(); } + QByteArray toAscii() const Q_REQUIRED_RESULT; + QByteArray toLatin1() const Q_REQUIRED_RESULT; + QByteArray toUtf8() const Q_REQUIRED_RESULT; + QByteArray toLocal8Bit() const Q_REQUIRED_RESULT; + QVector toUcs4() const Q_REQUIRED_RESULT; + inline void clear() { m_string = 0; m_position = m_size = 0; } QString toString() const; inline bool isEmpty() const { return m_size == 0; } diff --git a/tests/auto/qstring/tst_qstring.cpp b/tests/auto/qstring/tst_qstring.cpp index a3f7f15..af6b371 100644 --- a/tests/auto/qstring/tst_qstring.cpp +++ b/tests/auto/qstring/tst_qstring.cpp @@ -174,6 +174,12 @@ private slots: void fromLatin1Roundtrip(); void toLatin1Roundtrip_data(); void toLatin1Roundtrip(); + void stringRef_toLatin1Roundtrip_data(); + void stringRef_toLatin1Roundtrip(); + void stringRef_utf8_data(); + void stringRef_utf8(); + void stringRef_local8Bit_data(); + void stringRef_local8Bit(); void fromLatin1(); void fromAscii(); void arg(); @@ -3128,6 +3134,20 @@ void tst_QString::utf8() QCOMPARE( utf8, QByteArray(res.toUtf8()) ); } +void tst_QString::stringRef_utf8_data() +{ + utf8_data(); +} + +void tst_QString::stringRef_utf8() +{ + QFETCH( QByteArray, utf8 ); + QFETCH( QString, res ); + + QStringRef ref(&res, 0, res.length()); + QCOMPARE( utf8, QByteArray(ref.toUtf8()) ); +} + // copied to tst_QTextCodec::utf8Codec_data() void tst_QString::fromUtf8_data() { @@ -3307,6 +3327,20 @@ void tst_QString::local8Bit() QCOMPARE(local8Bit.toLocal8Bit(), QByteArray(result)); } +void tst_QString::stringRef_local8Bit_data() +{ + local8Bit_data(); +} + +void tst_QString::stringRef_local8Bit() +{ + QFETCH(QString, local8Bit); + QFETCH(QByteArray, result); + + QStringRef ref(&local8Bit, 0, local8Bit.length()); + QCOMPARE(ref.toLocal8Bit(), QByteArray(result)); +} + void tst_QString::fromLatin1Roundtrip_data() { QTest::addColumn("latin1"); @@ -3410,6 +3444,38 @@ void tst_QString::toLatin1Roundtrip() QCOMPARE(QString::fromLatin1(latin1, latin1.length()), unicodedst); } +void tst_QString::stringRef_toLatin1Roundtrip_data() +{ + toLatin1Roundtrip_data(); +} + +void tst_QString::stringRef_toLatin1Roundtrip() +{ + QFETCH(QByteArray, latin1); + QFETCH(QString, unicodesrc); + QFETCH(QString, unicodedst); + + // QtTest safety check: + Q_ASSERT(latin1.isNull() == unicodesrc.isNull()); + Q_ASSERT(latin1.isEmpty() == unicodesrc.isEmpty()); + Q_ASSERT(latin1.length() == unicodesrc.length()); + Q_ASSERT(latin1.isNull() == unicodedst.isNull()); + Q_ASSERT(latin1.isEmpty() == unicodedst.isEmpty()); + Q_ASSERT(latin1.length() == unicodedst.length()); + + if (!latin1.isEmpty()) + while (latin1.length() < 128) { + latin1 += latin1; + unicodesrc += unicodesrc; + unicodedst += unicodedst; + } + + // toLatin1 + QStringRef src(&unicodesrc, 0, unicodesrc.length()); + QCOMPARE(src.toLatin1().length(), latin1.length()); + QCOMPARE(src.toLatin1(), latin1); +} + void tst_QString::fromLatin1() { QString a; -- cgit v0.12