summary refs log tree commit diff stats
path: root/src/corelib/tools/qstring.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/corelib/tools/qstring.cpp')
-rw-r--r-- src/corelib/tools/qstring.cpp 177
1 files changed, 96 insertions, 81 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index eff87e8..5ed3db5 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -335,7 +335,7 @@ const QString::Null QString::null = { };
\macro QT_NO_CAST_TO_ASCII
\relates QString
- disables automatic conversion from QString to ASCII 8-bit strings (char *)
+ disables automatic conversion from QString to 8-bit strings (char *)
\sa QT_NO_CAST_FROM_ASCII, QT_NO_CAST_FROM_BYTEARRAY
*/
@@ -391,10 +391,10 @@ const QString::Null QString::null = { };
with code values above 65535 are stored using surrogate pairs,
i.e., two consecutive \l{QChar}s.)
- \l{Unicode} is an international standard that supports most of
- the writing systems in use today. It is a superset of ASCII and
- Latin-1 (ISO 8859-1), and all the ASCII/Latin-1 characters are
- available at the same code positions.
+ \l{Unicode} is an international standard that supports most of the
+ writing systems in use today. It is a superset of US-ASCII (ANSI
+ X3.4-1986) and Latin-1 (ISO 8859-1), and all the US-ASCII/Latin-1
+ characters are available at the same code positions.
Behind the scenes, QString uses \l{implicit sharing}
(copy-on-write) to reduce memory usage and to avoid the needless
@@ -562,11 +562,13 @@ const QString::Null QString::null = { };
toLatin1(), toUtf8(), and toLocal8Bit().
\list
- \o toAscii() returns an ASCII encoded 8-bit string.
+ \o toAscii() returns an 8-bit string encoded using the codec
+ specified by QTextCodec::codecForCStrings (by default, that is
+ Latin 1).
\o toLatin1() returns a Latin-1 (ISO 8859-1) encoded 8-bit string.
\o toUtf8() returns a UTF-8 encoded 8-bit string. UTF-8 is a
- superset of ASCII that supports the entire Unicode character
- set through multibyte sequences.
+ superset of US-ASCII (ANSI X3.4-1986) that supports the entire
+ Unicode character set through multibyte sequences.
\o toLocal8Bit() returns an 8-bit string using the system's local
encoding.
\endlist
@@ -578,7 +580,7 @@ const QString::Null QString::null = { };
As mentioned above, QString provides a lot of functions and
operators that make it easy to interoperate with \c{const char *}
strings. But this functionality is a double-edged sword: It makes
- QString more convenient to use if all strings are ASCII or
+ QString more convenient to use if all strings are US-ASCII or
Latin-1, but there is always the risk that an implicit conversion
from or to \c{const char *} is done using the wrong 8-bit
encoding. To minimize these risks, you can turn off these implicit
@@ -586,9 +588,9 @@ const QString::Null QString::null = { };
\list
\o \c QT_NO_CAST_FROM_ASCII disables automatic conversions from
- ASCII to Unicode.
+ C string literals and pointers to Unicode.
\o \c QT_NO_CAST_TO_ASCII disables automatic conversion from QString
- to ASCII.
+ to C strings.
\endlist
One way to define these preprocessor symbols globally for your
@@ -837,7 +839,7 @@ int QString::grow(int size)
/*! \fn QString::QString(const char *str)
- Constructs a string initialized with the ASCII string \a str. The
+ Constructs a string initialized with the 8-bit string \a str. The
given const char pointer is converted to Unicode using the
fromAscii() function.
@@ -1337,8 +1339,9 @@ QString &QString::operator=(const QString &other)
\overload operator=()
- Assigns \a ba to this string. The byte array is converted to
- Unicode using the fromAscii() function.
+ Assigns \a ba to this string. The byte array is converted to Unicode
+ using the fromAscii() function. This function stops conversion at the
+ first NUL character found, or the end of the \a ba byte array.
You can disable this operator by defining \c
QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -2131,7 +2134,8 @@ bool QString::operator==(const QLatin1String &other) const
\overload operator==()
The \a other byte array is converted to a QString using the
- fromAscii() function.
+ fromAscii() function. This function stops conversion at the
+ first NUL character found, or the end of the \a other byte array.
You can disable this operator by defining \c
QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -2192,7 +2196,8 @@ bool QString::operator<(const QLatin1String &other) const
\overload operator<()
The \a other byte array is converted to a QString using the
- fromAscii() function.
+ fromAscii() function. If any NUL characters ('\0') are embedded
+ in the \a other byte array, they will be included in the transformation.
You can disable this operator by defining \c
QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -2234,7 +2239,8 @@ bool QString::operator<(const QLatin1String &other) const
\overload operator<=()
The \a other byte array is converted to a QString using the
- fromAscii() function.
+ fromAscii() function. If any NUL characters ('\0') are embedded
+ in the \a other byte array, they will be included in the transformation.
You can disable this operator by defining \c
QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -2292,7 +2298,8 @@ bool QString::operator>(const QLatin1String &other) const
\overload operator>()
The \a other byte array is converted to a QString using the
- fromAscii() function.
+ fromAscii() function. If any NUL characters ('\0') are embedded
+ in the \a other byte array, they will be included in the transformation.
You can disable this operator by defining \c
QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -2334,7 +2341,8 @@ bool QString::operator>(const QLatin1String &other) const
\overload operator>=()
The \a other byte array is converted to a QString using the
- fromAscii() function.
+ fromAscii() function. If any NUL characters ('\0') are embedded
+ in the \a other byte array, they will be included in the transformation.
You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII
when you compile your applications. This can be useful if you want
@@ -2376,7 +2384,8 @@ bool QString::operator>(const QLatin1String &other) const
\overload operator!=()
The \a other byte array is converted to a QString using the
- fromAscii() function.
+ fromAscii() function. If any NUL characters ('\0') are embedded
+ in the \a other byte array, they will be included in the transformation.
You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII
when you compile your applications. This can be useful if you want
@@ -3563,8 +3572,10 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
/*!
Returns a Latin-1 representation of the string as a QByteArray.
- The returned byte array is undefined if the string contains
- non-Latin1 characters.
+
+ The returned byte array is undefined if the string contains non-Latin1
+ characters. Those characters may be suppressed or replaced with a
+ question mark.
\sa fromLatin1(), toAscii(), toUtf8(), toLocal8Bit(), QTextCodec
*/
@@ -3578,12 +3589,15 @@ QByteArray QString::toLatin1() const
// isn't necessary in the header. See task 177402.
/*!
- Returns an 8-bit ASCII representation of the string as a QByteArray.
+ Returns an 8-bit representation of the string as a QByteArray.
If a codec has been set using QTextCodec::setCodecForCStrings(),
it is used to convert Unicode to 8-bit char; otherwise this
function does the same as toLatin1().
+ Note that, despite the name, this function does not necessarily return a US-ASCII
+ (ANSI X3.4-1986) string and its result may not be US-ASCII compatible.
+
\sa fromAscii(), toLatin1(), toUtf8(), toLocal8Bit(), QTextCodec
*/
QByteArray QString::toAscii() const
@@ -3611,8 +3625,13 @@ static QByteArray toLocal8Bit_helper(const QChar *data, int length)
QByteArray. The returned byte array is undefined if the string
contains characters not supported by the local 8-bit encoding.
- QTextCodec::codecForLocale() is used to perform the conversion
- from Unicode.
+ QTextCodec::codecForLocale() is used to perform the conversion from
+ Unicode. If the locale encoding could not be determined, this function
+ does the same as toLatin1().
+
+ If this string contains any characters that cannot be encoded in the
+ locale, the returned byte array is undefined. Those characters may be
+ suppressed or replaced by another character.
\sa fromLocal8Bit(), toAscii(), toLatin1(), toUtf8(), QTextCodec
*/
@@ -3628,54 +3647,34 @@ QByteArray QString::toLocal8Bit() const
/*!
Returns a UTF-8 representation of the string as a QByteArray.
+ UTF-8 is a Unicode codec and can represent all characters in a Unicode
+ string like QString.
+
+ However, in the Unicode range, there are certain codepoints that are not
+ considered characters. The Unicode standard reserves the last two
+ codepoints in each Unicode Plane (U+FFFE, U+FFFF, U+1FFFE, U+1FFFF,
+ U+2FFFE, etc.), as well as 16 codepoints in the range U+FDD0..U+FDDF,
+ inclusive, as non-characters. If any of those appear in the string, they
+ may be discarded and will not appear in the UTF-8 representation, or they
+ may be replaced by one or more replacement characters.
+
\sa fromUtf8(), toAscii(), toLatin1(), toLocal8Bit(), QTextCodec
*/
QByteArray QString::toUtf8() const
{
- QByteArray ba;
- if (d->size) {
- int l = d->size;
- int rlen = l*3+1;
- ba.resize(rlen);
- uchar *cursor = (uchar*)ba.data();
- const ushort *ch =d->data;
- for (int i=0; i < l; i++) {
- uint u = *ch;
- if (u < 0x80) {
- *cursor++ = (uchar)u;
- } else {
- if (u < 0x0800) {
- *cursor++ = 0xc0 | ((uchar) (u >> 6));
- } else {
- if (QChar(u).isHighSurrogate() && i < l-1) {
- ushort low = ch[1];
- if (QChar(low).isLowSurrogate()) {
- ++ch;
- ++i;
- u = QChar::surrogateToUcs4(u,low);
- }
- }
- if (u > 0xffff) {
- *cursor++ = 0xf0 | ((uchar) (u >> 18));
- *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
- } else {
- *cursor++ = 0xe0 | ((uchar) (u >> 12));
- }
- *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f);
- }
- *cursor++ = 0x80 | ((uchar) (u&0x3f));
- }
- ++ch;
- }
- ba.resize(cursor - (uchar*)ba.constData());
- }
- return ba;
+ if (isNull())
+ return QByteArray();
+
+ return QUtf8::convertFromUnicode(constData(), length(), 0);
}
/*!
\since 4.2
- Returns a UCS-4 representation of the string as a QVector<uint>.
+ Returns a UCS-4/UTF-32 representation of the string as a QVector<uint>.
+
+ UCS-4 is a Unicode codec and is lossless. All characters from this string
+ can be encoded in UCS-4.
\sa fromUtf8(), toAscii(), toLatin1(), toLocal8Bit(), QTextCodec, fromUcs4(), toWCharArray()
*/
@@ -3952,14 +3951,16 @@ QString QString::fromLocal8Bit(const char *str, int size)
/*!
Returns a QString initialized with the first \a size characters
- of the 8-bit ASCII string \a str.
+ of the 8-bit string \a str.
If \a size is -1 (default), it is taken to be qstrlen(\a
str).
- If a codec has been set using QTextCodec::setCodecForCStrings(),
- it is used to convert \a str to Unicode; otherwise this function
- does the same as fromLatin1().
+ Note that, despite the name, this function actually uses the codec
+ defined by QTextCodec::setCodecForCStrings() to convert \a str to
+ Unicode. Depending on the codec, it may not accept valid US-ASCII (ANSI
+ X3.4-1986) input. If no codec has been set, this function does the same
+ as fromLatin1().
\sa toAscii(), fromLatin1(), fromUtf8(), fromLocal8Bit()
*/
@@ -3975,6 +3976,18 @@ QString QString::fromAscii(const char *str, int size)
If \a size is -1 (default), it is taken to be qstrlen(\a
str).
+ UTF-8 is a Unicode codec and can represent all characters in a Unicode
+ string like QString. However, invalid sequences are possible with UTF-8
+ and, if any such are found, they will be replaced with one or more
+ "replacement characters", or suppressed. These include non-Unicode
+ sequences, non-characters, overlong sequences or surrogate codepoints
+ encoded into UTF-8.
+
+ Non-characters are codepoints that the Unicode standard reserves and must
+ not be used in text interchange. They are the last two codepoints in each
+ Unicode Plane (U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, etc.), as well
+ as 16 codepoints in the range U+FDD0..U+FDDF, inclusive.
+
\sa toUtf8(), fromAscii(), fromLatin1(), fromLocal8Bit()
*/
QString QString::fromUtf8(const char *str, int size)
@@ -4399,8 +4412,10 @@ QString& QString::fill(QChar ch, int size)
\overload operator+=()
- Appends the byte array \a ba to this string. The byte array is
- converted to Unicode using the fromAscii() function.
+ Appends the byte array \a ba to this string. The byte array is converted
+ to Unicode using the fromAscii() function. If any NUL characters ('\0')
+ are embedded in the \a ba byte array, they will be included in the
+ transformation.
You can disable this function by defining \c
QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -6176,7 +6191,7 @@ QStringList QString::split(const QRegExp &rx, SplitBehavior behavior) const
*/
QString QString::normalized(QString::NormalizationForm mode) const
{
- return normalized(mode, CURRENT_VERSION);
+ return normalized(mode, UNICODE_DATA_VERSION);
}
/*!
@@ -6258,7 +6273,7 @@ void qt_string_normalize(QString *data, QString::NormalizationForm mode, QChar::
return;
QString &s = *data;
- if (version != CURRENT_VERSION) {
+ if (version != UNICODE_DATA_VERSION) {
for (int i = 0; i < NumNormalizationCorrections; ++i) {
const NormalizationCorrection &n = uc_normalization_corrections[i];
if (n.version > version) {
@@ -7085,9 +7100,9 @@ void QString::updateProperties() const
This operator is mostly useful to pass a QString to a function
that accepts a std::string object.
- If the QString contains non-ASCII Unicode characters, using this
- operator can lead to loss of information, since the implementation
- calls toAscii().
+ If the QString contains Unicode characters that the
+ QTextCodec::codecForCStrings() codec cannot handle, using this operator
+ can lead to loss of information.
This operator is only available if Qt is configured with STL
compatibility enabled.
@@ -7138,7 +7153,7 @@ QString QString::fromRawData(const QChar *unicode, int size)
}
/*! \class QLatin1String
- \brief The QLatin1String class provides a thin wrapper around an ASCII/Latin-1 encoded string literal.
+ \brief The QLatin1String class provides a thin wrapper around a US-ASCII/Latin-1 encoded string literal.
\ingroup string-processing
\reentrant
@@ -7225,7 +7240,7 @@ QString QString::fromRawData(const QChar *unicode, int size)
\since 4.3
\overload
- The \a other const char pointer is converted to a QLatin1String using
+ The \a other const char pointer is converted to a QString using
the QString::fromAscii() function.
You can disable this operator by defining \c
@@ -7250,7 +7265,7 @@ QString QString::fromRawData(const QChar *unicode, int size)
\since 4.3
\overload operator!=()
- The \a other const char pointer is converted to a QLatin1String using
+ The \a other const char pointer is converted to a QString using
the QString::fromAscii() function.
You can disable this operator by defining \c
@@ -7276,7 +7291,7 @@ QString QString::fromRawData(const QChar *unicode, int size)
\since 4.3
\overload
- The \a other const char pointer is converted to a QLatin1String using
+ The \a other const char pointer is converted to a QString using
the QString::fromAscii() function.
You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII
@@ -7302,7 +7317,7 @@ QString QString::fromRawData(const QChar *unicode, int size)
\since 4.3
\overload
- The \a other const char pointer is converted to a QLatin1String using
+ The \a other const char pointer is converted to a QString using
the QString::fromAscii() function.
You can disable this operator by defining \c
@@ -7328,7 +7343,7 @@ QString QString::fromRawData(const QChar *unicode, int size)
\since 4.3
\overload
- The \a other const char pointer is converted to a QLatin1String using
+ The \a other const char pointer is converted to a QString using
the QString::fromAscii() function.
You can disable this operator by defining \c