1 files changed, 96 insertions, 81 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index eff87e8..5ed3db5 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -335,7 +335,7 @@ const QString::Null QString::null = { };
   \macro QT_NO_CAST_TO_ASCII
   \relates QString
 
-  disables automatic conversion from QString to ASCII 8-bit strings (char *)
+  Disables automatic conversion from QString to 8-bit strings (char *).
 
   \sa QT_NO_CAST_FROM_ASCII, QT_NO_CAST_FROM_BYTEARRAY
 */
@@ -391,10 +391,10 @@ const QString::Null QString::null = { };
     with code values above 65535 are stored using surrogate pairs,
     i.e., two consecutive \l{QChar}s.)
 
-    \l{Unicode} is an international standard that supports most of
-    the writing systems in use today. It is a superset of ASCII and
-    Latin-1 (ISO 8859-1), and all the ASCII/Latin-1 characters are
-    available at the same code positions.
+    \l{Unicode} is an international standard that supports most of the
+    writing systems in use today. It is a superset of US-ASCII (ANSI
+    X3.4-1986) and Latin-1 (ISO 8859-1), and all the US-ASCII/Latin-1
+    characters are available at the same code positions.
 
     Behind the scenes, QString uses \l{implicit sharing}
     (copy-on-write) to reduce memory usage and to avoid the needless
@@ -562,11 +562,13 @@ const QString::Null QString::null = { };
     toLatin1(), toUtf8(), and toLocal8Bit().
 
     \list
-    \o toAscii() returns an ASCII encoded 8-bit string.
+    \o toAscii() returns an 8-bit string encoded using the codec
+       specified by QTextCodec::codecForCStrings() (by default, that is
+       Latin-1).
     \o toLatin1() returns a Latin-1 (ISO 8859-1) encoded 8-bit string.
     \o toUtf8() returns a UTF-8 encoded 8-bit string. UTF-8 is a
-       superset of ASCII that supports the entire Unicode character
-       set through multibyte sequences.
+       superset of US-ASCII (ANSI X3.4-1986) that supports the entire
+       Unicode character set through multibyte sequences.
     \o toLocal8Bit() returns an 8-bit string using the system's local
        encoding.
     \endlist
@@ -578,7 +580,7 @@ const QString::Null QString::null = { };
     As mentioned above, QString provides a lot of functions and
     operators that make it easy to interoperate with \c{const char *}
     strings. But this functionality is a double-edged sword: It makes
-    QString more convenient to use if all strings are ASCII or
+    QString more convenient to use if all strings are US-ASCII or
     Latin-1, but there is always the risk that an implicit conversion
     from or to \c{const char *} is done using the wrong 8-bit
     encoding. To minimize these risks, you can turn off these implicit
@@ -586,9 +588,9 @@ const QString::Null QString::null = { };
 
     \list
     \o \c QT_NO_CAST_FROM_ASCII disables automatic conversions from
-       ASCII to Unicode.
+       C string literals and \c{const char *} pointers to Unicode.
     \o \c QT_NO_CAST_TO_ASCII disables automatic conversion from QString
-       to ASCII.
+       to C strings.
     \endlist
 
     One way to define these preprocessor symbols globally for your
@@ -837,7 +839,7 @@ int QString::grow(int size)
 
 /*! \fn QString::QString(const char *str)
 
-    Constructs a string initialized with the ASCII string \a str. The
+    Constructs a string initialized with the 8-bit string \a str. The
     given const char pointer is converted to Unicode using the
     fromAscii() function.
 
@@ -1337,8 +1339,9 @@ QString &QString::operator=(const QString &other)
 
     \overload operator=()
 
-    Assigns \a ba to this string. The byte array is converted to
-    Unicode using the fromAscii() function.
+    Assigns \a ba to this string. The byte array is converted to Unicode
+    using the fromAscii() function. This function stops conversion at the
+    first NUL character found, or the end of the \a ba byte array.
 
     You can disable this operator by defining \c
     QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -2131,7 +2134,8 @@ bool QString::operator==(const QLatin1String &other) const
     \overload operator==()
 
     The \a other byte array is converted to a QString using the
-    fromAscii() function.
+    fromAscii() function. This function stops conversion at the
+    first NUL character found, or the end of the \a other byte array.
 
     You can disable this operator by defining \c
     QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -2192,7 +2196,8 @@ bool QString::operator<(const QLatin1String &other) const
     \overload operator<()
 
     The \a other byte array is converted to a QString using the
-    fromAscii() function.
+    fromAscii() function. If any NUL characters ('\0') are embedded
+    in the \a other byte array, they will be included in the transformation.
 
     You can disable this operator by defining \c
     QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -2234,7 +2239,8 @@ bool QString::operator<(const QLatin1String &other) const
     \overload operator<=()
 
     The \a other byte array is converted to a QString using the
-    fromAscii() function.
+    fromAscii() function. If any NUL characters ('\0') are embedded
+    in the \a other byte array, they will be included in the transformation.
 
     You can disable this operator by defining \c
     QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -2292,7 +2298,8 @@ bool QString::operator>(const QLatin1String &other) const
     \overload operator>()
 
     The \a other byte array is converted to a QString using the
-    fromAscii() function.
+    fromAscii() function. If any NUL characters ('\0') are embedded
+    in the \a other byte array, they will be included in the transformation.
 
     You can disable this operator by defining \c
     QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -2334,7 +2341,8 @@ bool QString::operator>(const QLatin1String &other) const
     \overload operator>=()
 
     The \a other byte array is converted to a QString using the
-    fromAscii() function.
+    fromAscii() function. If any NUL characters ('\0') are embedded
+    in the \a other byte array, they will be included in the transformation.
 
     You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII
     when you compile your applications. This can be useful if you want
@@ -2376,7 +2384,8 @@ bool QString::operator>(const QLatin1String &other) const
     \overload operator!=()
 
     The \a other byte array is converted to a QString using the
-    fromAscii() function.
+    fromAscii() function. If any NUL characters ('\0') are embedded
+    in the \a other byte array, they will be included in the transformation.
 
     You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII
     when you compile your applications. This can be useful if you want
@@ -3563,8 +3572,10 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
 
 /*!
     Returns a Latin-1 representation of the string as a QByteArray.
-    The returned byte array is undefined if the string contains
-    non-Latin1 characters.
+
+    The returned byte array is undefined if the string contains non-Latin1
+    characters. Those characters may be suppressed or replaced with a
+    question mark.
 
     \sa fromLatin1(), toAscii(), toUtf8(), toLocal8Bit(), QTextCodec
 */
@@ -3578,12 +3589,15 @@ QByteArray QString::toLatin1() const
 // isn't necessary in the header. See task 177402.
 
 /*!
-    Returns an 8-bit ASCII representation of the string as a QByteArray.
+    Returns an 8-bit representation of the string as a QByteArray.
 
     If a codec has been set using QTextCodec::setCodecForCStrings(),
     it is used to convert Unicode to 8-bit char; otherwise this
     function does the same as toLatin1().
 
+    Note that, despite the name, this function does not necessarily return a US-ASCII
+    (ANSI X3.4-1986) string and its result may not be US-ASCII compatible.
+
     \sa fromAscii(), toLatin1(), toUtf8(), toLocal8Bit(), QTextCodec
 */
 QByteArray QString::toAscii() const
@@ -3611,8 +3625,13 @@ static QByteArray toLocal8Bit_helper(const QChar *data, int length)
     QByteArray. The returned byte array is undefined if the string
     contains characters not supported by the local 8-bit encoding.
 
-    QTextCodec::codecForLocale() is used to perform the conversion
-    from Unicode.
+    QTextCodec::codecForLocale() is used to perform the conversion from
+    Unicode. If the locale encoding could not be determined, this function
+    does the same as toLatin1().
+
+    If this string contains any characters that cannot be encoded in the
+    locale, the returned byte array is undefined. Those characters may be
+    suppressed or replaced by another character.
 
     \sa fromLocal8Bit(), toAscii(), toLatin1(), toUtf8(), QTextCodec
 */
@@ -3628,54 +3647,34 @@ QByteArray QString::toLocal8Bit() const
 /*!
     Returns a UTF-8 representation of the string as a QByteArray.
 
+    UTF-8 is a Unicode codec and can represent all characters in a Unicode
+    string like QString.
+
+    However, in the Unicode range, there are certain codepoints that are not
+    considered characters. The Unicode standard reserves the last two
+    codepoints in each Unicode Plane (U+FFFE, U+FFFF, U+1FFFE, U+1FFFF,
+    U+2FFFE, etc.), as well as 16 codepoints in the range U+FDD0..U+FDDF,
+    inclusive, as non-characters. If any of those appear in the string, they
+    may be discarded and will not appear in the UTF-8 representation, or they
+    may be replaced by one or more replacement characters.
+
     \sa fromUtf8(), toAscii(), toLatin1(), toLocal8Bit(), QTextCodec
 */
 QByteArray QString::toUtf8() const
 {
-    QByteArray ba;
-    if (d->size) {
-        int l = d->size;
-        int rlen = l*3+1;
-        ba.resize(rlen);
-        uchar *cursor = (uchar*)ba.data();
-        const ushort *ch =d->data;
-        for (int i=0; i < l; i++) {
-            uint u = *ch;
-            if (u < 0x80) {
-                *cursor++ = (uchar)u;
-            } else {
-                if (u < 0x0800) {
-                    *cursor++ = 0xc0 | ((uchar) (u >> 6));
-                } else {
-                    if (QChar(u).isHighSurrogate() && i < l-1) {
-                        ushort low = ch[1];
-                        if (QChar(low).isLowSurrogate()) {
-                            ++ch;
-                            ++i;
-                            u = QChar::surrogateToUcs4(u,low);
-                        }
-                    }
-                    if (u > 0xffff) {
-                        *cursor++ = 0xf0 | ((uchar) (u >> 18));
-                        *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
-                    } else {
-                        *cursor++ = 0xe0 | ((uchar) (u >> 12));
-                    }
-                    *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f);
-                }
-                *cursor++ = 0x80 | ((uchar) (u&0x3f));
-            }
-            ++ch;
-        }
-        ba.resize(cursor - (uchar*)ba.constData());
-    }
-    return ba;
+    if (isNull())
+        return QByteArray();
+
+    return QUtf8::convertFromUnicode(constData(), length(), 0);
 }
 
 /*!
     \since 4.2
 
-    Returns a UCS-4 representation of the string as a QVector<uint>.
+    Returns a UCS-4/UTF-32 representation of the string as a QVector<uint>.
+
+    UCS-4 is a Unicode codec and is lossless. All characters from this string
+    can be encoded in UCS-4.
 
     \sa fromUtf8(), toAscii(), toLatin1(), toLocal8Bit(), QTextCodec, fromUcs4(), toWCharArray()
 */
@@ -3952,14 +3951,16 @@ QString QString::fromLocal8Bit(const char *str, int size)
 
 /*!
     Returns a QString initialized with the first \a size characters
-    of the 8-bit ASCII string \a str.
+    of the 8-bit string \a str.
 
     If \a size is -1 (default), it is taken to be qstrlen(\a
     str).
 
-    If a codec has been set using QTextCodec::setCodecForCStrings(),
-    it is used to convert \a str to Unicode; otherwise this function
-    does the same as fromLatin1().
+    Note that, despite the name, this function actually uses the codec
+    defined by QTextCodec::setCodecForCStrings() to convert \a str to
+    Unicode. Depending on the codec, it may not accept valid US-ASCII (ANSI
+    X3.4-1986) input. If no codec has been set, this function does the same
+    as fromLatin1().
 
     \sa toAscii(), fromLatin1(), fromUtf8(), fromLocal8Bit()
 */
@@ -3975,6 +3976,18 @@ QString QString::fromAscii(const char *str, int size)
     If \a size is -1 (default), it is taken to be qstrlen(\a
     str).
 
+    UTF-8 is a Unicode codec and can represent all characters in a Unicode
+    string like QString. However, invalid sequences are possible with UTF-8
+    and, if any such are found, they will be replaced with one or more
+    "replacement characters", or suppressed. These include non-Unicode
+    sequences, non-characters, overlong sequences or surrogate codepoints
+    encoded into UTF-8.
+
+    Non-characters are codepoints that the Unicode standard reserves; they must
+    not be used in text interchange. They are the last two codepoints in each
+    Unicode Plane (U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, etc.), as well
+    as 16 codepoints in the range U+FDD0..U+FDDF, inclusive.
+
     \sa toUtf8(), fromAscii(), fromLatin1(), fromLocal8Bit()
 */
 QString QString::fromUtf8(const char *str, int size)
@@ -4399,8 +4412,10 @@ QString& QString::fill(QChar ch, int size)
 
     \overload operator+=()
 
-    Appends the byte array \a ba to this string. The byte array is
-    converted to Unicode using the fromAscii() function.
+    Appends the byte array \a ba to this string. The byte array is converted
+    to Unicode using the fromAscii() function. If any NUL characters ('\0')
+    are embedded in the \a ba byte array, they will be included in the
+    transformation.
 
     You can disable this function by defining \c
     QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -6176,7 +6191,7 @@ QStringList QString::split(const QRegExp &rx, SplitBehavior behavior) const
 */
 QString QString::normalized(QString::NormalizationForm mode) const
 {
-    return normalized(mode, CURRENT_VERSION);
+    return normalized(mode, UNICODE_DATA_VERSION);
 }
 
 /*!
@@ -6258,7 +6273,7 @@ void qt_string_normalize(QString *data, QString::NormalizationForm mode, QChar::
         return;
 
     QString &s = *data;
-    if (version != CURRENT_VERSION) {
+    if (version != UNICODE_DATA_VERSION) {
         for (int i = 0; i < NumNormalizationCorrections; ++i) {
             const NormalizationCorrection &n = uc_normalization_corrections[i];
             if (n.version > version) {
@@ -7085,9 +7100,9 @@ void QString::updateProperties() const
     This operator is mostly useful to pass a QString to a function
     that accepts a std::string object.
 
-    If the QString contains non-ASCII Unicode characters, using this
-    operator can lead to loss of information, since the implementation
-    calls toAscii().
+    If the QString contains Unicode characters that the
+    QTextCodec::codecForCStrings() codec cannot handle, using this operator
+    can lead to loss of information.
 
     This operator is only available if Qt is configured with STL
     compatibility enabled.
@@ -7138,7 +7153,7 @@ QString QString::fromRawData(const QChar *unicode, int size)
 }
 
 /*! \class QLatin1String
-    \brief The QLatin1String class provides a thin wrapper around an ASCII/Latin-1 encoded string literal.
+    \brief The QLatin1String class provides a thin wrapper around a US-ASCII/Latin-1 encoded string literal.
 
     \ingroup string-processing
     \reentrant
@@ -7225,7 +7240,7 @@ QString QString::fromRawData(const QChar *unicode, int size)
     \since 4.3
     \overload
 
-    The \a other const char pointer is converted to a QLatin1String using
+    The \a other const char pointer is converted to a QString using
     the QString::fromAscii() function.
 
     You can disable this operator by defining \c
@@ -7250,7 +7265,7 @@ QString QString::fromRawData(const QChar *unicode, int size)
     \since 4.3
     \overload operator!=()
 
-    The \a other const char pointer is converted to a QLatin1String using
+    The \a other const char pointer is converted to a QString using
     the QString::fromAscii() function.
 
     You can disable this operator by defining \c
@@ -7276,7 +7291,7 @@ QString QString::fromRawData(const QChar *unicode, int size)
     \since 4.3
     \overload
 
-    The \a other const char pointer is converted to a QLatin1String using
+    The \a other const char pointer is converted to a QString using
     the QString::fromAscii() function.
 
     You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII
@@ -7302,7 +7317,7 @@ QString QString::fromRawData(const QChar *unicode, int size)
     \since 4.3
     \overload
 
-    The \a other const char pointer is converted to a QLatin1String using
+    The \a other const char pointer is converted to a QString using
     the QString::fromAscii() function.
 
     You can disable this operator by defining \c
@@ -7328,7 +7343,7 @@ QString QString::fromRawData(const QChar *unicode, int size)
     \since 4.3
     \overload
 
-    The \a other const char pointer is converted to a QLatin1String using
+    The \a other const char pointer is converted to a QString using
     the QString::fromAscii() function.
 
     You can disable this operator by defining \c