From b6b12bc6b8296d7e199cab0ece35c9eb9ae7fe64 Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Wed, 18 Mar 2009 14:36:52 +0100 Subject: Implement strict STD3 checking of hostnames in URLs. Made the toPunycodeHelper function write to a QString. Renamed qt_from_ACE to qt_ACE_do to indicate what it actually does. Added the STD3 rules for hostnames, forcing hostnames to have to strictly comply to STD3. Also, execute nameprep in the correct order (before trying to encode to Punycode). Validate hostnames when QUrlPrivate::canonicalHost() called, including validation of IP Literals. Validation of IPv4 is missing. Adapted other functions to use qt_ACE_do, notably QUrl::toAce (avoid code duplication). --- src/corelib/io/qurl.cpp | 241 ++++++++++++++++++++++++++++++------------- tests/auto/qurl/tst_qurl.cpp | 96 ++++++++++++++--- 2 files changed, 256 insertions(+), 81 deletions(-) diff --git a/src/corelib/io/qurl.cpp b/src/corelib/io/qurl.cpp index 180e9ec..dc27dfb 100644 --- a/src/corelib/io/qurl.cpp +++ b/src/corelib/io/qurl.cpp @@ -2901,8 +2901,17 @@ static bool isBidirectionalL(const QChar &ch) return false; } +#ifdef QT_BUILD_INTERNAL +// export for tst_qurl.cpp +Q_AUTOTEST_EXPORT QString qt_nameprep(const QString &); +Q_AUTOTEST_EXPORT bool qt_check_std3rules(const QStringRef &); +#else +// non-test build, keep the symbols for ourselves +static QString qt_nameprep(const QString &); +static bool qt_check_std3rules(const QStringRef &); +#endif -Q_AUTOTEST_EXPORT QString qt_nameprep(const QString &source) +QString qt_nameprep(const QString &source) { QString mapped = source; @@ -2959,8 +2968,32 @@ Q_AUTOTEST_EXPORT QString qt_nameprep(const QString &source) return mapped; } +bool qt_check_std3rules(const QStringRef &source) +{ + int len = source.length(); + if (len > 63) + return false; + + for (int i = 0; i < len; ++i) { + register ushort c = source.at(i).unicode(); + if (c == '-' && (i == 0 || i == len - 1)) + return false; -static inline char encodeDigit(uint digit) + // verifying the absence of LDH is the same as verifying that + // only LDH is present + if (c == '-' || (c >= '0' && c <= '9') + || (c >= 'A' && c <= 'Z') + || (c >= 'a' && c <= 'z')) + continue; + + return false; + } + + return true; +} + + +static inline uint encodeDigit(uint digit) { return digit + 22 + 75 * (digit < 26); } @@ -2977,7 +3010,7 @@ static inline uint adapt(uint delta, uint numpoints, bool firsttime) return k + (((base - tmin + 1) * delta) / (delta + skew)); } -static inline void appendEncode(QByteArray* output, uint& delta, uint& bias, uint& b, uint& h) +static inline void appendEncode(QString* output, uint& delta, uint& bias, uint& b, uint& h) { uint qq; uint k; @@ -2991,17 +3024,17 @@ static inline void appendEncode(QByteArray* output, uint& delta, uint& bias, uin t = (k <= bias) ? tmin : (k >= bias + tmax) ? tmax : k - bias; if (qq < t) break; - *output += encodeDigit(t + (qq - t) % (base - t)); + *output += QChar(encodeDigit(t + (qq - t) % (base - t))); qq = (qq - t) / (base - t); } - *output += encodeDigit(qq); + *output += QChar(encodeDigit(qq)); bias = adapt(delta, h + 1, h == b); delta = 0; ++h; } -static void toPunycodeHelper(const QChar *s, int ucLength, QByteArray *output) +static void toPunycodeHelper(const QChar *s, int ucLength, QString *output) { uint n = initial_n; uint delta = 0; @@ -3010,7 +3043,7 @@ static void toPunycodeHelper(const QChar *s, int ucLength, QByteArray *output) int outLen = output->length(); output->resize(outLen + ucLength); - char *d = output->data() + outLen; + QChar *d = output->data() + outLen; bool skipped = false; // copy all basic code points verbatim to output. for (uint j = 0; j < (uint) ucLength; ++j) { @@ -3035,7 +3068,7 @@ static void toPunycodeHelper(const QChar *s, int ucLength, QByteArray *output) // if basic code points were copied, add the delimiter character. if (h > 0) - *output += 0x2d; + *output += QChar(0x2d); // while there are still unprocessed non-basic code points left in // the input string... @@ -3083,7 +3116,7 @@ static void toPunycodeHelper(const QChar *s, int ucLength, QByteArray *output) } // prepend ACE prefix - output->insert(outLen, "xn--"); + output->insert(outLen, QLatin1String("xn--")); return; } @@ -3164,46 +3197,118 @@ static bool qt_is_idn_enabled(const QString &domain) return equal(tld, len, idn_whitelist[i]); } -static QString qt_from_ACE(const QString &domainMC) +static inline bool isDotDelimiter(ushort uc) { + // IDNA / rfc3490 describes these four delimiters used for + // separating labels in unicode international domain + // names. + return uc == 0x2e || uc == 0x3002 || uc == 0xff0e || uc == 0xff61; +} + +static int nextDotDelimiter(const QString &domain, int from = 0) +{ + const QChar *b = domain.unicode(); + const QChar *ch = b + from; + const QChar *e = b + domain.length(); + while (ch < e) { + if (isDotDelimiter(ch->unicode())) + break; + else + ++ch; + } + return ch - b; +} + +enum AceOperation { ToAceOnly, NormalizeAce }; +static QString qt_ACE_do(const QString &domainMC, AceOperation op) +{ + if (domainMC.isEmpty()) + return domainMC; QString domain = domainMC.toLower(); - int idx = domain.indexOf(QLatin1Char('.')); - if (idx != -1) { - if (!domain.contains(QLatin1String("xn--"))) { - bool simple = true; - for (int i = 0; i < domain.size(); ++i) { - ushort ch = domain.at(i).unicode(); - if (ch > 'z' || ch < '-' || ch == '/' || (ch > '9' && ch < 'A') || (ch > 'Z' && ch < 'a')) { - simple = false; - break; - } + + QString result; + result.reserve(domain.length()); + + const bool isIdnEnabled = op == NormalizeAce ? qt_is_idn_enabled(domain) : false; + int lastIdx = 0; + while (1) { + int idx = nextDotDelimiter(domain, lastIdx); + if (idx == lastIdx) + return QString(); // two delimiters in a row -- empty label not allowed + + // RFC 3490 says, about the ToASCII operation: + // 3. If the UseSTD3ASCIIRules flag is set, then perform these checks: + // + // (a) Verify the absence of non-LDH ASCII code points; that is, the + // absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F. + // + // (b) Verify the absence of leading and trailing hyphen-minus; that + // is, the absence of U+002D at the beginning and end of the + // sequence. + // and: + // 8. Verify that the number of code points is in the range 1 to 63 + // inclusive. + + bool simple = true; + for (int i = lastIdx; i < idx; ++i) { + ushort ch = domain.at(i).unicode(); + if (ch > 0x7f) { + simple = false; + break; } - if (simple) - return domain; } - - const bool isIdnEnabled = qt_is_idn_enabled(domain); - int lastIdx = 0; - QString result; - while (1) { + + if (simple && idx > lastIdx + 4) { + // ACE form domains are simple, but we can't consider them simple + // is this an ACE form? + static const ushort acePrefixUtf16[] = { 'x', 'n', '-', '-' }; + if (memcmp(domain.utf16() + lastIdx, acePrefixUtf16, sizeof acePrefixUtf16) == 0) + simple = false; + } + + QString aux; + QStringRef label; + if (simple) { + // fastest conversion: this is the common case (non IDN-domains) + // just memcpy from source (domain) to destination (result) + // there's no need to nameprep since everything is ASCII already + int prevLen = result.size(); + result.resize(prevLen + idx - lastIdx); + memcpy(result.data() + prevLen, domain.constData() + lastIdx, (idx - lastIdx) * sizeof(QChar)); + + label = QStringRef(&result, prevLen, result.length() - prevLen); + } else { // Nameprep the host. If the labels in the hostname are Punycode // encoded, we decode them immediately, then nameprep them. - QByteArray label; - toPunycodeHelper(domain.constData() + lastIdx, idx - lastIdx, &label); - result += qt_nameprep(isIdnEnabled ? QUrl::fromPunycode(label) : QString::fromLatin1(label)); - lastIdx = idx + 1; - if (lastIdx < domain.size() + 1) - result += QLatin1Char('.'); - else - break; - idx = domain.indexOf(QLatin1Char('.'), lastIdx); - if (idx == -1) - idx = domain.size(); + QString tmp = domain.mid(lastIdx, idx - lastIdx); + tmp = qt_nameprep(tmp); + + if (isIdnEnabled) { + toPunycodeHelper(tmp.constData(), tmp.size(), &aux); + label = QStringRef(&aux); + + tmp = QUrl::fromPunycode(aux.toLatin1()); + if (tmp.isEmpty()) + return QString(); + result += tmp; + } else { + int prevLen = result.size(); + toPunycodeHelper(tmp.constData(), tmp.size(), &result); + + label = QStringRef(&result, prevLen, result.length() - prevLen); + } } - return result; - } else { - return qt_nameprep(domain); + + if (!qt_check_std3rules(label)) + return QString(); + + lastIdx = idx + 1; + if (lastIdx < domain.size() + 1) + result += QLatin1Char('.'); + else + break; } + return result; } @@ -3246,12 +3351,27 @@ QUrlPrivate::QUrlPrivate(const QUrlPrivate ©) QString QUrlPrivate::canonicalHost() const { - if (QURL_HASFLAG(stateFlags, HostCanonicalized)) + if (QURL_HASFLAG(stateFlags, HostCanonicalized) || host.isEmpty()) return host; QUrlPrivate *that = const_cast(this); QURL_SETFLAG(that->stateFlags, HostCanonicalized); - that->host = qt_from_ACE(host); + if (host.contains(QLatin1Char(':'))) { + // This is an IP Literal, use _IPLiteral to validate + QByteArray ba = host.toLatin1(); + if (!ba.startsWith('[')) { + // surround the IP Literal with [ ] if it's not already done so + ba.reserve(ba.length() + 2); + ba.prepend('['); + ba.append(']'); + } + + const char *ptr = ba.constData(); + if (!_IPLiteral(&ptr)) + that->host.clear(); + } else { + that->host = qt_ACE_do(host, NormalizeAce); + } return that->host; } @@ -3737,7 +3857,10 @@ QByteArray QUrlPrivate::toEncoded(QUrl::FormattingOptions options) const } } - url += QUrl::toAce(host); + if (host.startsWith(QLatin1Char('['))) + url += host.toLatin1(); + else + url += QUrl::toAce(host); if (!(options & QUrl::RemovePort) && port != -1) { url += ':'; url += QString::number(port).toAscii(); @@ -4412,8 +4535,6 @@ void QUrl::setHost(const QString &host) QURL_UNSETFLAG(d->stateFlags, QUrlPrivate::Validated | QUrlPrivate::Normalized | QUrlPrivate::HostCanonicalized); d->host = host; - if (d->host.contains(QLatin1Char(':'))) - d->host = QLatin1Char('[') + d->host + QLatin1Char(']'); } /*! @@ -5425,9 +5546,9 @@ QByteArray QUrl::toPercentEncoding(const QString &input, const QByteArray &exclu */ QByteArray QUrl::toPunycode(const QString &uc) { - QByteArray output; + QString output; toPunycodeHelper(uc.constData(), uc.size(), &output); - return output; + return output.toLatin1(); } /*! @@ -5528,7 +5649,7 @@ QString QUrl::fromPunycode(const QByteArray &pc) */ QString QUrl::fromAce(const QByteArray &domain) { - return qt_from_ACE(QString::fromLatin1(domain)); + return qt_ACE_do(QString::fromLatin1(domain), NormalizeAce); } /*! @@ -5545,26 +5666,8 @@ QString QUrl::fromAce(const QByteArray &domain) */ QByteArray QUrl::toAce(const QString &domain) { - // IDNA / rfc3490 describes these four delimiters used for - // separating labels in unicode international domain - // names. - QString nameprepped = qt_nameprep(domain); - int lastIdx = 0; - QByteArray result; - for (int i = 0; i < nameprepped.size(); ++i) { - ushort uc = nameprepped.at(i).unicode(); - if (uc == 0x2e || uc == 0x3002 || uc == 0xff0e || uc == 0xff61) { - if (lastIdx) - result += '.'; - toPunycodeHelper(nameprepped.constData() + lastIdx, i - lastIdx, &result); - lastIdx = i + 1; - } - } - if (lastIdx) - result += '.'; - toPunycodeHelper(nameprepped.constData() + lastIdx, nameprepped.size() - lastIdx, &result); - - return result; + QString result = qt_ACE_do(domain, ToAceOnly); + return result.toLatin1(); } /*! diff --git a/tests/auto/qurl/tst_qurl.cpp b/tests/auto/qurl/tst_qurl.cpp index 723f882..fcced37 100644 --- a/tests/auto/qurl/tst_qurl.cpp +++ b/tests/auto/qurl/tst_qurl.cpp @@ -162,6 +162,8 @@ private slots: void nameprep_testsuite(); void ace_testsuite_data(); void ace_testsuite(); + void std3violations_data(); + void std3violations(); void tldRestrictions_data(); void tldRestrictions(); void emptyQueryOrFragment(); @@ -3100,6 +3102,11 @@ void tst_QUrl::ace_testsuite_data() QTest::newRow("ascii-upper") << "FLUKE" << "fluke" << "fluke" << "fluke"; QTest::newRow("asciifolded") << QString::fromLatin1("stra\337e") << "strasse" << "." << "strasse"; + QTest::newRow("asciifolded-dotcom") << QString::fromLatin1("stra\337e.example.com") << "strasse.example.com" << "." << "strasse.example.com"; + QTest::newRow("greek-mu") << QString::fromLatin1("\265V") + <<"xn--v-lmb" + << "." + << QString::fromUtf8("\316\274v"); QTest::newRow("non-ascii-lower") << QString::fromLatin1("alqualond\353") << "xn--alqualond-34a" @@ -3132,6 +3139,9 @@ void tst_QUrl::ace_testsuite_data() QTest::newRow("idn-upper") << "XN--ALQUALOND-34A" << "xn--alqualond-34a" << QString::fromLatin1("alqualond\353") << QString::fromLatin1("alqualond\353"); + + QTest::newRow("separator-3002") << QString::fromUtf8("example\343\200\202com") + << "example.com" << "." << "example.com"; } void tst_QUrl::ace_testsuite() @@ -3142,23 +3152,85 @@ void tst_QUrl::ace_testsuite() QFETCH(QString, fromace); QFETCH(QString, unicode); - QString domain = in + canonsuffix; - QCOMPARE(QString::fromLatin1(QUrl::toAce(domain)), toace + canonsuffix); + const char *suffix = canonsuffix; + if (toace.contains('.')) + suffix = 0; + + QString domain = in + suffix; + QCOMPARE(QString::fromLatin1(QUrl::toAce(domain)), toace + suffix); if (fromace != ".") - QCOMPARE(QUrl::fromAce(domain.toLatin1()), fromace + canonsuffix); - QCOMPARE(QUrl::fromAce(QUrl::toAce(domain)), unicode + canonsuffix); + QCOMPARE(QUrl::fromAce(domain.toLatin1()), fromace + suffix); + QCOMPARE(QUrl::fromAce(QUrl::toAce(domain)), unicode + suffix); - domain = in + ".troll.No"; - QCOMPARE(QString::fromLatin1(QUrl::toAce(domain)), toace + canonsuffix); + domain = in + (suffix ? ".troll.No" : ""); + QCOMPARE(QString::fromLatin1(QUrl::toAce(domain)), toace + suffix); if (fromace != ".") - QCOMPARE(QUrl::fromAce(domain.toLatin1()), fromace + canonsuffix); - QCOMPARE(QUrl::fromAce(QUrl::toAce(domain)), unicode + canonsuffix); + QCOMPARE(QUrl::fromAce(domain.toLatin1()), fromace + suffix); + QCOMPARE(QUrl::fromAce(QUrl::toAce(domain)), unicode + suffix); - domain = in + ".troll.NO"; - QCOMPARE(QString::fromLatin1(QUrl::toAce(domain)), toace + canonsuffix); + domain = in + (suffix ? ".troll.NO" : ""); + QCOMPARE(QString::fromLatin1(QUrl::toAce(domain)), toace + suffix); if (fromace != ".") - QCOMPARE(QUrl::fromAce(domain.toLatin1()), fromace + canonsuffix); - QCOMPARE(QUrl::fromAce(QUrl::toAce(domain)), unicode + canonsuffix); + QCOMPARE(QUrl::fromAce(domain.toLatin1()), fromace + suffix); + QCOMPARE(QUrl::fromAce(QUrl::toAce(domain)), unicode + suffix); +} + +void tst_QUrl::std3violations_data() +{ + QTest::addColumn("source"); + QTest::addColumn("validUrl"); + + QTest::newRow("too-long") << "this-domain-is-far-too-long-for-its-own-good-and-should-have-been-limited-to-63-chars" << false; + QTest::newRow("dash-begin") << "-x-foo" << false; + QTest::newRow("dash-end") << "x-foo-" << false; + QTest::newRow("dash-begin-end") << "-foo-" << false; + + QTest::newRow("control") << "\033foo" << false; + QTest::newRow("bang") << "foo!" << false; + QTest::newRow("plus") << "foo+bar" << false; + QTest::newRow("dot") << "foo.bar"; + QTest::newRow("slash") << "foo/bar" << true; + QTest::newRow("colon") << "foo:80" << true; + QTest::newRow("question") << "foo?bar" << true; + QTest::newRow("at") << "foo@bar" << true; + QTest::newRow("backslash") << "foo\\bar" << false; + QTest::newRow("underline") << "foo_bar" << false; + + // these characters are transformed by NFKC to non-LDH characters + QTest::newRow("dot-like") << QString::fromUtf8("foo\342\200\244bar") << false; // U+2024 ONE DOT LEADER + QTest::newRow("slash-like") << QString::fromUtf8("foo\357\274\217bar") << false; // U+FF0F FULLWIDTH SOLIDUS + + // The following should be invalid but isn't + // the DIVISON SLASH doesn't case-fold to a slash + // is this a problem with RFC 3490? + //QTest::newRow("slash-like2") << QString::fromUtf8("foo\342\210\225bar") << false; // U+2215 DIVISION SLASH +} + +void tst_QUrl::std3violations() +{ + QFETCH(QString, source); + + extern QString qt_nameprep(const QString &); + extern bool qt_check_std3rules(const QStringRef &); + + { + QString prepped = qt_nameprep(source); + QVERIFY(!qt_check_std3rules(QStringRef(&prepped))); + } + + if (source.contains('.')) + return; // this test ends here + + QUrl url; + url.setHost(source); + QVERIFY(url.host().isEmpty()); + + QFETCH(bool, validUrl); + if (validUrl) + return; // test ends here for these cases + + url = QUrl("http://" + source + "/some/path"); + QVERIFY(!url.isValid()); } void tst_QUrl::tldRestrictions_data() -- cgit v0.12