summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/corelib/io/qurl.cpp241
-rw-r--r--tests/auto/qurl/tst_qurl.cpp96
2 files changed, 256 insertions, 81 deletions
diff --git a/src/corelib/io/qurl.cpp b/src/corelib/io/qurl.cpp
index 180e9ec..dc27dfb 100644
--- a/src/corelib/io/qurl.cpp
+++ b/src/corelib/io/qurl.cpp
@@ -2901,8 +2901,17 @@ static bool isBidirectionalL(const QChar &ch)
return false;
}
+#ifdef QT_BUILD_INTERNAL
+// export for tst_qurl.cpp
+Q_AUTOTEST_EXPORT QString qt_nameprep(const QString &);
+Q_AUTOTEST_EXPORT bool qt_check_std3rules(const QStringRef &);
+#else
+// non-test build, keep the symbols for ourselves
+static QString qt_nameprep(const QString &);
+static bool qt_check_std3rules(const QStringRef &);
+#endif
-Q_AUTOTEST_EXPORT QString qt_nameprep(const QString &source)
+QString qt_nameprep(const QString &source)
{
QString mapped = source;
@@ -2959,8 +2968,32 @@ Q_AUTOTEST_EXPORT QString qt_nameprep(const QString &source)
return mapped;
}
+bool qt_check_std3rules(const QStringRef &source)
+{
+ int len = source.length();
+ if (len > 63)
+ return false;
+
+ for (int i = 0; i < len; ++i) {
+ register ushort c = source.at(i).unicode();
+ if (c == '-' && (i == 0 || i == len - 1))
+ return false;
-static inline char encodeDigit(uint digit)
+ // verifying the absence of LDH is the same as verifying that
+ // only LDH is present
+ if (c == '-' || (c >= '0' && c <= '9')
+ || (c >= 'A' && c <= 'Z')
+ || (c >= 'a' && c <= 'z'))
+ continue;
+
+ return false;
+ }
+
+ return true;
+}
+
+
+static inline uint encodeDigit(uint digit)
{
return digit + 22 + 75 * (digit < 26);
}
@@ -2977,7 +3010,7 @@ static inline uint adapt(uint delta, uint numpoints, bool firsttime)
return k + (((base - tmin + 1) * delta) / (delta + skew));
}
-static inline void appendEncode(QByteArray* output, uint& delta, uint& bias, uint& b, uint& h)
+static inline void appendEncode(QString* output, uint& delta, uint& bias, uint& b, uint& h)
{
uint qq;
uint k;
@@ -2991,17 +3024,17 @@ static inline void appendEncode(QByteArray* output, uint& delta, uint& bias, uin
t = (k <= bias) ? tmin : (k >= bias + tmax) ? tmax : k - bias;
if (qq < t) break;
- *output += encodeDigit(t + (qq - t) % (base - t));
+ *output += QChar(encodeDigit(t + (qq - t) % (base - t)));
qq = (qq - t) / (base - t);
}
- *output += encodeDigit(qq);
+ *output += QChar(encodeDigit(qq));
bias = adapt(delta, h + 1, h == b);
delta = 0;
++h;
}
-static void toPunycodeHelper(const QChar *s, int ucLength, QByteArray *output)
+static void toPunycodeHelper(const QChar *s, int ucLength, QString *output)
{
uint n = initial_n;
uint delta = 0;
@@ -3010,7 +3043,7 @@ static void toPunycodeHelper(const QChar *s, int ucLength, QByteArray *output)
int outLen = output->length();
output->resize(outLen + ucLength);
- char *d = output->data() + outLen;
+ QChar *d = output->data() + outLen;
bool skipped = false;
// copy all basic code points verbatim to output.
for (uint j = 0; j < (uint) ucLength; ++j) {
@@ -3035,7 +3068,7 @@ static void toPunycodeHelper(const QChar *s, int ucLength, QByteArray *output)
// if basic code points were copied, add the delimiter character.
if (h > 0)
- *output += 0x2d;
+ *output += QChar(0x2d);
// while there are still unprocessed non-basic code points left in
// the input string...
@@ -3083,7 +3116,7 @@ static void toPunycodeHelper(const QChar *s, int ucLength, QByteArray *output)
}
// prepend ACE prefix
- output->insert(outLen, "xn--");
+ output->insert(outLen, QLatin1String("xn--"));
return;
}
@@ -3164,46 +3197,118 @@ static bool qt_is_idn_enabled(const QString &domain)
return equal(tld, len, idn_whitelist[i]);
}
-static QString qt_from_ACE(const QString &domainMC)
+static inline bool isDotDelimiter(ushort uc)
{
+ // IDNA / rfc3490 describes these four delimiters used for
+ // separating labels in unicode international domain
+ // names.
+ return uc == 0x2e || uc == 0x3002 || uc == 0xff0e || uc == 0xff61;
+}
+
+static int nextDotDelimiter(const QString &domain, int from = 0)
+{
+ const QChar *b = domain.unicode();
+ const QChar *ch = b + from;
+ const QChar *e = b + domain.length();
+ while (ch < e) {
+ if (isDotDelimiter(ch->unicode()))
+ break;
+ else
+ ++ch;
+ }
+ return ch - b;
+}
+
+enum AceOperation { ToAceOnly, NormalizeAce };
+static QString qt_ACE_do(const QString &domainMC, AceOperation op)
+{
+ if (domainMC.isEmpty())
+ return domainMC;
QString domain = domainMC.toLower();
- int idx = domain.indexOf(QLatin1Char('.'));
- if (idx != -1) {
- if (!domain.contains(QLatin1String("xn--"))) {
- bool simple = true;
- for (int i = 0; i < domain.size(); ++i) {
- ushort ch = domain.at(i).unicode();
- if (ch > 'z' || ch < '-' || ch == '/' || (ch > '9' && ch < 'A') || (ch > 'Z' && ch < 'a')) {
- simple = false;
- break;
- }
+
+ QString result;
+ result.reserve(domain.length());
+
+ const bool isIdnEnabled = op == NormalizeAce ? qt_is_idn_enabled(domain) : false;
+ int lastIdx = 0;
+ while (1) {
+ int idx = nextDotDelimiter(domain, lastIdx);
+ if (idx == lastIdx)
+ return QString(); // two delimiters in a row -- empty label not allowed
+
+ // RFC 3490 says, about the ToASCII operation:
+ // 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
+ //
+ // (a) Verify the absence of non-LDH ASCII code points; that is, the
+ // absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
+ //
+ // (b) Verify the absence of leading and trailing hyphen-minus; that
+ // is, the absence of U+002D at the beginning and end of the
+ // sequence.
+ // and:
+ // 8. Verify that the number of code points is in the range 1 to 63
+ // inclusive.
+
+ bool simple = true;
+ for (int i = lastIdx; i < idx; ++i) {
+ ushort ch = domain.at(i).unicode();
+ if (ch > 0x7f) {
+ simple = false;
+ break;
}
- if (simple)
- return domain;
}
-
- const bool isIdnEnabled = qt_is_idn_enabled(domain);
- int lastIdx = 0;
- QString result;
- while (1) {
+
+ if (simple && idx > lastIdx + 4) {
+ // ACE form domains are simple, but we can't consider them simple
+ // is this an ACE form?
+ static const ushort acePrefixUtf16[] = { 'x', 'n', '-', '-' };
+ if (memcmp(domain.utf16() + lastIdx, acePrefixUtf16, sizeof acePrefixUtf16) == 0)
+ simple = false;
+ }
+
+ QString aux;
+ QStringRef label;
+ if (simple) {
+ // fastest conversion: this is the common case (non IDN-domains)
+ // just memcpy from source (domain) to destination (result)
+ // there's no need to nameprep since everything is ASCII already
+ int prevLen = result.size();
+ result.resize(prevLen + idx - lastIdx);
+ memcpy(result.data() + prevLen, domain.constData() + lastIdx, (idx - lastIdx) * sizeof(QChar));
+
+ label = QStringRef(&result, prevLen, result.length() - prevLen);
+ } else {
// Nameprep the host. If the labels in the hostname are Punycode
// encoded, we decode them immediately, then nameprep them.
- QByteArray label;
- toPunycodeHelper(domain.constData() + lastIdx, idx - lastIdx, &label);
- result += qt_nameprep(isIdnEnabled ? QUrl::fromPunycode(label) : QString::fromLatin1(label));
- lastIdx = idx + 1;
- if (lastIdx < domain.size() + 1)
- result += QLatin1Char('.');
- else
- break;
- idx = domain.indexOf(QLatin1Char('.'), lastIdx);
- if (idx == -1)
- idx = domain.size();
+ QString tmp = domain.mid(lastIdx, idx - lastIdx);
+ tmp = qt_nameprep(tmp);
+
+ if (isIdnEnabled) {
+ toPunycodeHelper(tmp.constData(), tmp.size(), &aux);
+ label = QStringRef(&aux);
+
+ tmp = QUrl::fromPunycode(aux.toLatin1());
+ if (tmp.isEmpty())
+ return QString();
+ result += tmp;
+ } else {
+ int prevLen = result.size();
+ toPunycodeHelper(tmp.constData(), tmp.size(), &result);
+
+ label = QStringRef(&result, prevLen, result.length() - prevLen);
+ }
}
- return result;
- } else {
- return qt_nameprep(domain);
+
+ if (!qt_check_std3rules(label))
+ return QString();
+
+ lastIdx = idx + 1;
+ if (lastIdx < domain.size() + 1)
+ result += QLatin1Char('.');
+ else
+ break;
}
+ return result;
}
@@ -3246,12 +3351,27 @@ QUrlPrivate::QUrlPrivate(const QUrlPrivate &copy)
QString QUrlPrivate::canonicalHost() const
{
- if (QURL_HASFLAG(stateFlags, HostCanonicalized))
+ if (QURL_HASFLAG(stateFlags, HostCanonicalized) || host.isEmpty())
return host;
QUrlPrivate *that = const_cast<QUrlPrivate *>(this);
QURL_SETFLAG(that->stateFlags, HostCanonicalized);
- that->host = qt_from_ACE(host);
+ if (host.contains(QLatin1Char(':'))) {
+ // This is an IP Literal, use _IPLiteral to validate
+ QByteArray ba = host.toLatin1();
+ if (!ba.startsWith('[')) {
+ // surround the IP Literal with [ ] if it's not already done so
+ ba.reserve(ba.length() + 2);
+ ba.prepend('[');
+ ba.append(']');
+ }
+
+ const char *ptr = ba.constData();
+ if (!_IPLiteral(&ptr))
+ that->host.clear();
+ } else {
+ that->host = qt_ACE_do(host, NormalizeAce);
+ }
return that->host;
}
@@ -3737,7 +3857,10 @@ QByteArray QUrlPrivate::toEncoded(QUrl::FormattingOptions options) const
}
}
- url += QUrl::toAce(host);
+ if (host.startsWith(QLatin1Char('[')))
+ url += host.toLatin1();
+ else
+ url += QUrl::toAce(host);
if (!(options & QUrl::RemovePort) && port != -1) {
url += ':';
url += QString::number(port).toAscii();
@@ -4412,8 +4535,6 @@ void QUrl::setHost(const QString &host)
QURL_UNSETFLAG(d->stateFlags, QUrlPrivate::Validated | QUrlPrivate::Normalized | QUrlPrivate::HostCanonicalized);
d->host = host;
- if (d->host.contains(QLatin1Char(':')))
- d->host = QLatin1Char('[') + d->host + QLatin1Char(']');
}
/*!
@@ -5425,9 +5546,9 @@ QByteArray QUrl::toPercentEncoding(const QString &input, const QByteArray &exclu
*/
QByteArray QUrl::toPunycode(const QString &uc)
{
- QByteArray output;
+ QString output;
toPunycodeHelper(uc.constData(), uc.size(), &output);
- return output;
+ return output.toLatin1();
}
/*!
@@ -5528,7 +5649,7 @@ QString QUrl::fromPunycode(const QByteArray &pc)
*/
QString QUrl::fromAce(const QByteArray &domain)
{
- return qt_from_ACE(QString::fromLatin1(domain));
+ return qt_ACE_do(QString::fromLatin1(domain), NormalizeAce);
}
/*!
@@ -5545,26 +5666,8 @@ QString QUrl::fromAce(const QByteArray &domain)
*/
QByteArray QUrl::toAce(const QString &domain)
{
- // IDNA / rfc3490 describes these four delimiters used for
- // separating labels in unicode international domain
- // names.
- QString nameprepped = qt_nameprep(domain);
- int lastIdx = 0;
- QByteArray result;
- for (int i = 0; i < nameprepped.size(); ++i) {
- ushort uc = nameprepped.at(i).unicode();
- if (uc == 0x2e || uc == 0x3002 || uc == 0xff0e || uc == 0xff61) {
- if (lastIdx)
- result += '.';
- toPunycodeHelper(nameprepped.constData() + lastIdx, i - lastIdx, &result);
- lastIdx = i + 1;
- }
- }
- if (lastIdx)
- result += '.';
- toPunycodeHelper(nameprepped.constData() + lastIdx, nameprepped.size() - lastIdx, &result);
-
- return result;
+ QString result = qt_ACE_do(domain, ToAceOnly);
+ return result.toLatin1();
}
/*!
diff --git a/tests/auto/qurl/tst_qurl.cpp b/tests/auto/qurl/tst_qurl.cpp
index 723f882..fcced37 100644
--- a/tests/auto/qurl/tst_qurl.cpp
+++ b/tests/auto/qurl/tst_qurl.cpp
@@ -162,6 +162,8 @@ private slots:
void nameprep_testsuite();
void ace_testsuite_data();
void ace_testsuite();
+ void std3violations_data();
+ void std3violations();
void tldRestrictions_data();
void tldRestrictions();
void emptyQueryOrFragment();
@@ -3100,6 +3102,11 @@ void tst_QUrl::ace_testsuite_data()
QTest::newRow("ascii-upper") << "FLUKE" << "fluke" << "fluke" << "fluke";
QTest::newRow("asciifolded") << QString::fromLatin1("stra\337e") << "strasse" << "." << "strasse";
+ QTest::newRow("asciifolded-dotcom") << QString::fromLatin1("stra\337e.example.com") << "strasse.example.com" << "." << "strasse.example.com";
+ QTest::newRow("greek-mu") << QString::fromLatin1("\265V")
+ <<"xn--v-lmb"
+ << "."
+ << QString::fromUtf8("\316\274v");
QTest::newRow("non-ascii-lower") << QString::fromLatin1("alqualond\353")
<< "xn--alqualond-34a"
@@ -3132,6 +3139,9 @@ void tst_QUrl::ace_testsuite_data()
QTest::newRow("idn-upper") << "XN--ALQUALOND-34A" << "xn--alqualond-34a"
<< QString::fromLatin1("alqualond\353")
<< QString::fromLatin1("alqualond\353");
+
+ QTest::newRow("separator-3002") << QString::fromUtf8("example\343\200\202com")
+ << "example.com" << "." << "example.com";
}
void tst_QUrl::ace_testsuite()
@@ -3142,23 +3152,85 @@ void tst_QUrl::ace_testsuite()
QFETCH(QString, fromace);
QFETCH(QString, unicode);
- QString domain = in + canonsuffix;
- QCOMPARE(QString::fromLatin1(QUrl::toAce(domain)), toace + canonsuffix);
+ const char *suffix = canonsuffix;
+ if (toace.contains('.'))
+ suffix = 0;
+
+ QString domain = in + suffix;
+ QCOMPARE(QString::fromLatin1(QUrl::toAce(domain)), toace + suffix);
if (fromace != ".")
- QCOMPARE(QUrl::fromAce(domain.toLatin1()), fromace + canonsuffix);
- QCOMPARE(QUrl::fromAce(QUrl::toAce(domain)), unicode + canonsuffix);
+ QCOMPARE(QUrl::fromAce(domain.toLatin1()), fromace + suffix);
+ QCOMPARE(QUrl::fromAce(QUrl::toAce(domain)), unicode + suffix);
- domain = in + ".troll.No";
- QCOMPARE(QString::fromLatin1(QUrl::toAce(domain)), toace + canonsuffix);
+ domain = in + (suffix ? ".troll.No" : "");
+ QCOMPARE(QString::fromLatin1(QUrl::toAce(domain)), toace + suffix);
if (fromace != ".")
- QCOMPARE(QUrl::fromAce(domain.toLatin1()), fromace + canonsuffix);
- QCOMPARE(QUrl::fromAce(QUrl::toAce(domain)), unicode + canonsuffix);
+ QCOMPARE(QUrl::fromAce(domain.toLatin1()), fromace + suffix);
+ QCOMPARE(QUrl::fromAce(QUrl::toAce(domain)), unicode + suffix);
- domain = in + ".troll.NO";
- QCOMPARE(QString::fromLatin1(QUrl::toAce(domain)), toace + canonsuffix);
+ domain = in + (suffix ? ".troll.NO" : "");
+ QCOMPARE(QString::fromLatin1(QUrl::toAce(domain)), toace + suffix);
if (fromace != ".")
- QCOMPARE(QUrl::fromAce(domain.toLatin1()), fromace + canonsuffix);
- QCOMPARE(QUrl::fromAce(QUrl::toAce(domain)), unicode + canonsuffix);
+ QCOMPARE(QUrl::fromAce(domain.toLatin1()), fromace + suffix);
+ QCOMPARE(QUrl::fromAce(QUrl::toAce(domain)), unicode + suffix);
+}
+
+void tst_QUrl::std3violations_data()
+{
+ QTest::addColumn<QString>("source");
+ QTest::addColumn<bool>("validUrl");
+
+ QTest::newRow("too-long") << "this-domain-is-far-too-long-for-its-own-good-and-should-have-been-limited-to-63-chars" << false;
+ QTest::newRow("dash-begin") << "-x-foo" << false;
+ QTest::newRow("dash-end") << "x-foo-" << false;
+ QTest::newRow("dash-begin-end") << "-foo-" << false;
+
+ QTest::newRow("control") << "\033foo" << false;
+ QTest::newRow("bang") << "foo!" << false;
+ QTest::newRow("plus") << "foo+bar" << false;
+ QTest::newRow("dot") << "foo.bar";
+ QTest::newRow("slash") << "foo/bar" << true;
+ QTest::newRow("colon") << "foo:80" << true;
+ QTest::newRow("question") << "foo?bar" << true;
+ QTest::newRow("at") << "foo@bar" << true;
+ QTest::newRow("backslash") << "foo\\bar" << false;
+ QTest::newRow("underline") << "foo_bar" << false;
+
+ // these characters are transformed by NFKC to non-LDH characters
+ QTest::newRow("dot-like") << QString::fromUtf8("foo\342\200\244bar") << false; // U+2024 ONE DOT LEADER
+ QTest::newRow("slash-like") << QString::fromUtf8("foo\357\274\217bar") << false; // U+FF0F FULLWIDTH SOLIDUS
+
+ // The following should be invalid but isn't
+ // the DIVISON SLASH doesn't case-fold to a slash
+ // is this a problem with RFC 3490?
+ //QTest::newRow("slash-like2") << QString::fromUtf8("foo\342\210\225bar") << false; // U+2215 DIVISION SLASH
+}
+
+void tst_QUrl::std3violations()
+{
+ QFETCH(QString, source);
+
+ extern QString qt_nameprep(const QString &);
+ extern bool qt_check_std3rules(const QStringRef &);
+
+ {
+ QString prepped = qt_nameprep(source);
+ QVERIFY(!qt_check_std3rules(QStringRef(&prepped)));
+ }
+
+ if (source.contains('.'))
+ return; // this test ends here
+
+ QUrl url;
+ url.setHost(source);
+ QVERIFY(url.host().isEmpty());
+
+ QFETCH(bool, validUrl);
+ if (validUrl)
+ return; // test ends here for these cases
+
+ url = QUrl("http://" + source + "/some/path");
+ QVERIFY(!url.isValid());
}
void tst_QUrl::tldRestrictions_data()