diff options
Diffstat (limited to 'src/corelib/tools/qstring.cpp')
-rw-r--r-- | src/corelib/tools/qstring.cpp | 177 |
1 files changed, 96 insertions, 81 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index eff87e8..5ed3db5 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -335,7 +335,7 @@ const QString::Null QString::null = { }; \macro QT_NO_CAST_TO_ASCII \relates QString - disables automatic conversion from QString to ASCII 8-bit strings (char *) + disables automatic conversion from QString to 8-bit strings (char *) \sa QT_NO_CAST_FROM_ASCII, QT_NO_CAST_FROM_BYTEARRAY */ @@ -391,10 +391,10 @@ const QString::Null QString::null = { }; with code values above 65535 are stored using surrogate pairs, i.e., two consecutive \l{QChar}s.) - \l{Unicode} is an international standard that supports most of - the writing systems in use today. It is a superset of ASCII and - Latin-1 (ISO 8859-1), and all the ASCII/Latin-1 characters are - available at the same code positions. + \l{Unicode} is an international standard that supports most of the + writing systems in use today. It is a superset of US-ASCII (ANSI + X3.4-1986) and Latin-1 (ISO 8859-1), and all the US-ASCII/Latin-1 + characters are available at the same code positions. Behind the scenes, QString uses \l{implicit sharing} (copy-on-write) to reduce memory usage and to avoid the needless @@ -562,11 +562,13 @@ const QString::Null QString::null = { }; toLatin1(), toUtf8(), and toLocal8Bit(). \list - \o toAscii() returns an ASCII encoded 8-bit string. + \o toAscii() returns an 8-bit string encoded using the codec + specified by QTextCodec::codecForCStrings (by default, that is + Latin 1). \o toLatin1() returns a Latin-1 (ISO 8859-1) encoded 8-bit string. \o toUtf8() returns a UTF-8 encoded 8-bit string. UTF-8 is a - superset of ASCII that supports the entire Unicode character - set through multibyte sequences. + superset of US-ASCII (ANSI X3.4-1986) that supports the entire + Unicode character set through multibyte sequences. \o toLocal8Bit() returns an 8-bit string using the system's local encoding. 
\endlist @@ -578,7 +580,7 @@ const QString::Null QString::null = { }; As mentioned above, QString provides a lot of functions and operators that make it easy to interoperate with \c{const char *} strings. But this functionality is a double-edged sword: It makes - QString more convenient to use if all strings are ASCII or + QString more convenient to use if all strings are US-ASCII or Latin-1, but there is always the risk that an implicit conversion from or to \c{const char *} is done using the wrong 8-bit encoding. To minimize these risks, you can turn off these implicit @@ -586,9 +588,9 @@ const QString::Null QString::null = { }; \list \o \c QT_NO_CAST_FROM_ASCII disables automatic conversions from - ASCII to Unicode. + C string literals and pointers to Unicode. \o \c QT_NO_CAST_TO_ASCII disables automatic conversion from QString - to ASCII. + to C strings. \endlist One way to define these preprocessor symbols globally for your @@ -837,7 +839,7 @@ int QString::grow(int size) /*! \fn QString::QString(const char *str) - Constructs a string initialized with the ASCII string \a str. The + Constructs a string initialized with the 8-bit string \a str. The given const char pointer is converted to Unicode using the fromAscii() function. @@ -1337,8 +1339,9 @@ QString &QString::operator=(const QString &other) \overload operator=() - Assigns \a ba to this string. The byte array is converted to - Unicode using the fromAscii() function. + Assigns \a ba to this string. The byte array is converted to Unicode + using the fromAscii() function. This function stops conversion at the + first NUL character found, or the end of the \a ba byte array. You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII when you compile your applications. This @@ -2131,7 +2134,8 @@ bool QString::operator==(const QLatin1String &other) const \overload operator==() The \a other byte array is converted to a QString using the - fromAscii() function. + fromAscii() function. 
This function stops conversion at the + first NUL character found, or the end of the \a other byte array. You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII when you compile your applications. This @@ -2192,7 +2196,8 @@ bool QString::operator<(const QLatin1String &other) const \overload operator<() The \a other byte array is converted to a QString using the - fromAscii() function. + fromAscii() function. If any NUL characters ('\0') are embedded + in the \a other byte array, they will be included in the transformation. You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII when you compile your applications. This @@ -2234,7 +2239,8 @@ bool QString::operator<(const QLatin1String &other) const \overload operator<=() The \a other byte array is converted to a QString using the - fromAscii() function. + fromAscii() function. If any NUL characters ('\0') are embedded + in the \a other byte array, they will be included in the transformation. You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII when you compile your applications. This @@ -2292,7 +2298,8 @@ bool QString::operator>(const QLatin1String &other) const \overload operator>() The \a other byte array is converted to a QString using the - fromAscii() function. + fromAscii() function. If any NUL characters ('\0') are embedded + in the \a other byte array, they will be included in the transformation. You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII when you compile your applications. This @@ -2334,7 +2341,8 @@ bool QString::operator>(const QLatin1String &other) const \overload operator>=() The \a other byte array is converted to a QString using the - fromAscii() function. + fromAscii() function. If any NUL characters ('\0') are embedded + in the \a other byte array, they will be included in the transformation. You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII when you compile your applications. 
This can be useful if you want @@ -2376,7 +2384,8 @@ bool QString::operator>(const QLatin1String &other) const \overload operator!=() The \a other byte array is converted to a QString using the - fromAscii() function. + fromAscii() function. If any NUL characters ('\0') are embedded + in the \a other byte array, they will be included in the transformation. You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII when you compile your applications. This can be useful if you want @@ -3563,8 +3572,10 @@ static QByteArray toLatin1_helper(const QChar *data, int length) /*! Returns a Latin-1 representation of the string as a QByteArray. - The returned byte array is undefined if the string contains - non-Latin1 characters. + + The returned byte array is undefined if the string contains non-Latin1 + characters. Those characters may be suppressed or replaced with a + question mark. \sa fromLatin1(), toAscii(), toUtf8(), toLocal8Bit(), QTextCodec */ @@ -3578,12 +3589,15 @@ QByteArray QString::toLatin1() const // isn't necessary in the header. See task 177402. /*! - Returns an 8-bit ASCII representation of the string as a QByteArray. + Returns an 8-bit representation of the string as a QByteArray. If a codec has been set using QTextCodec::setCodecForCStrings(), it is used to convert Unicode to 8-bit char; otherwise this function does the same as toLatin1(). + Note that, despite the name, this function does not necessarily return a US-ASCII + (ANSI X3.4-1986) string and its result may not be US-ASCII compatible. + \sa fromAscii(), toLatin1(), toUtf8(), toLocal8Bit(), QTextCodec */ QByteArray QString::toAscii() const @@ -3611,8 +3625,13 @@ static QByteArray toLocal8Bit_helper(const QChar *data, int length) QByteArray. The returned byte array is undefined if the string contains characters not supported by the local 8-bit encoding. - QTextCodec::codecForLocale() is used to perform the conversion - from Unicode. 
+ QTextCodec::codecForLocale() is used to perform the conversion from + Unicode. If the locale encoding could not be determined, this function + does the same as toLatin1(). + + If this string contains any characters that cannot be encoded in the + locale, the returned byte array is undefined. Those characters may be + suppressed or replaced by another character. \sa fromLocal8Bit(), toAscii(), toLatin1(), toUtf8(), QTextCodec */ @@ -3628,54 +3647,34 @@ QByteArray QString::toLocal8Bit() const /*! Returns a UTF-8 representation of the string as a QByteArray. + UTF-8 is a Unicode codec and can represent all characters in a Unicode + string like QString. + + However, in the Unicode range, there are certain codepoints that are not + considered characters. The Unicode standard reserves the last two + codepoints in each Unicode Plane (U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, + U+2FFFE, etc.), as well as 16 codepoints in the range U+FDD0..U+FDDF, + inclusive, as non-characters. If any of those appear in the string, they + may be discarded and will not appear in the UTF-8 representation, or they + may be replaced by one or more replacement characters. 
+ \sa fromUtf8(), toAscii(), toLatin1(), toLocal8Bit(), QTextCodec */ QByteArray QString::toUtf8() const { - QByteArray ba; - if (d->size) { - int l = d->size; - int rlen = l*3+1; - ba.resize(rlen); - uchar *cursor = (uchar*)ba.data(); - const ushort *ch =d->data; - for (int i=0; i < l; i++) { - uint u = *ch; - if (u < 0x80) { - *cursor++ = (uchar)u; - } else { - if (u < 0x0800) { - *cursor++ = 0xc0 | ((uchar) (u >> 6)); - } else { - if (QChar(u).isHighSurrogate() && i < l-1) { - ushort low = ch[1]; - if (QChar(low).isLowSurrogate()) { - ++ch; - ++i; - u = QChar::surrogateToUcs4(u,low); - } - } - if (u > 0xffff) { - *cursor++ = 0xf0 | ((uchar) (u >> 18)); - *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f); - } else { - *cursor++ = 0xe0 | ((uchar) (u >> 12)); - } - *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f); - } - *cursor++ = 0x80 | ((uchar) (u&0x3f)); - } - ++ch; - } - ba.resize(cursor - (uchar*)ba.constData()); - } - return ba; + if (isNull()) + return QByteArray(); + + return QUtf8::convertFromUnicode(constData(), length(), 0); } /*! \since 4.2 - Returns a UCS-4 representation of the string as a QVector<uint>. + Returns a UCS-4/UTF-32 representation of the string as a QVector<uint>. + + UCS-4 is a Unicode codec and is lossless. All characters from this string + can be encoded in UCS-4. \sa fromUtf8(), toAscii(), toLatin1(), toLocal8Bit(), QTextCodec, fromUcs4(), toWCharArray() */ @@ -3952,14 +3951,16 @@ QString QString::fromLocal8Bit(const char *str, int size) /*! Returns a QString initialized with the first \a size characters - of the 8-bit ASCII string \a str. + of the 8-bit string \a str. If \a size is -1 (default), it is taken to be qstrlen(\a str). - If a codec has been set using QTextCodec::setCodecForCStrings(), - it is used to convert \a str to Unicode; otherwise this function - does the same as fromLatin1(). 
+ Note that, despite the name, this function actually uses the codec + defined by QTextCodec::setCodecForCStrings() to convert \a str to + Unicode. Depending on the codec, it may not accept valid US-ASCII (ANSI + X3.4-1986) input. If no codec has been set, this function does the same + as fromLatin1(). \sa toAscii(), fromLatin1(), fromUtf8(), fromLocal8Bit() */ @@ -3975,6 +3976,18 @@ QString QString::fromAscii(const char *str, int size) If \a size is -1 (default), it is taken to be qstrlen(\a str). + UTF-8 is a Unicode codec and can represent all characters in a Unicode + string like QString. However, invalid sequences are possible with UTF-8 + and, if any such are found, they will be replaced with one or more + "replacement characters", or suppressed. These include non-Unicode + sequences, non-characters, overlong sequences or surrogate codepoints + encoded into UTF-8. + + Non-characters are codepoints that the Unicode standard reserves and must + not be used in text interchange. They are the last two codepoints in each + Unicode Plane (U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, etc.), as well + as 16 codepoints in the range U+FDD0..U+FDDF, inclusive. + \sa toUtf8(), fromAscii(), fromLatin1(), fromLocal8Bit() */ QString QString::fromUtf8(const char *str, int size) @@ -4399,8 +4412,10 @@ QString& QString::fill(QChar ch, int size) \overload operator+=() - Appends the byte array \a ba to this string. The byte array is - converted to Unicode using the fromAscii() function. + Appends the byte array \a ba to this string. The byte array is converted + to Unicode using the fromAscii() function. If any NUL characters ('\0') + are embedded in the \a ba byte array, they will be included in the + transformation. You can disable this function by defining \c QT_NO_CAST_FROM_ASCII when you compile your applications. 
This @@ -6176,7 +6191,7 @@ QStringList QString::split(const QRegExp &rx, SplitBehavior behavior) const */ QString QString::normalized(QString::NormalizationForm mode) const { - return normalized(mode, CURRENT_VERSION); + return normalized(mode, UNICODE_DATA_VERSION); } /*! @@ -6258,7 +6273,7 @@ void qt_string_normalize(QString *data, QString::NormalizationForm mode, QChar:: return; QString &s = *data; - if (version != CURRENT_VERSION) { + if (version != UNICODE_DATA_VERSION) { for (int i = 0; i < NumNormalizationCorrections; ++i) { const NormalizationCorrection &n = uc_normalization_corrections[i]; if (n.version > version) { @@ -7085,9 +7100,9 @@ void QString::updateProperties() const This operator is mostly useful to pass a QString to a function that accepts a std::string object. - If the QString contains non-ASCII Unicode characters, using this - operator can lead to loss of information, since the implementation - calls toAscii(). + If the QString contains Unicode characters that the + QTextCodec::codecForCStrings() codec cannot handle, using this operator + can lead to loss of information. This operator is only available if Qt is configured with STL compatibility enabled. @@ -7138,7 +7153,7 @@ QString QString::fromRawData(const QChar *unicode, int size) } /*! \class QLatin1String - \brief The QLatin1String class provides a thin wrapper around an ASCII/Latin-1 encoded string literal. + \brief The QLatin1String class provides a thin wrapper around a US-ASCII/Latin-1 encoded string literal. \ingroup string-processing \reentrant @@ -7225,7 +7240,7 @@ QString QString::fromRawData(const QChar *unicode, int size) \since 4.3 \overload - The \a other const char pointer is converted to a QLatin1String using + The \a other const char pointer is converted to a QString using the QString::fromAscii() function. 
You can disable this operator by defining \c @@ -7250,7 +7265,7 @@ QString QString::fromRawData(const QChar *unicode, int size) \since 4.3 \overload operator!=() - The \a other const char pointer is converted to a QLatin1String using + The \a other const char pointer is converted to a QString using the QString::fromAscii() function. You can disable this operator by defining \c @@ -7276,7 +7291,7 @@ QString QString::fromRawData(const QChar *unicode, int size) \since 4.3 \overload - The \a other const char pointer is converted to a QLatin1String using + The \a other const char pointer is converted to a QString using the QString::fromAscii() function. You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII @@ -7302,7 +7317,7 @@ QString QString::fromRawData(const QChar *unicode, int size) \since 4.3 \overload - The \a other const char pointer is converted to a QLatin1String using + The \a other const char pointer is converted to a QString using the QString::fromAscii() function. You can disable this operator by defining \c @@ -7328,7 +7343,7 @@ QString QString::fromRawData(const QChar *unicode, int size) \since 4.3 \overload - The \a other const char pointer is converted to a QLatin1String using + The \a other const char pointer is converted to a QString using the QString::fromAscii() function. You can disable this operator by defining \c |