diff options
-rw-r--r-- | src/corelib/io/qurl.cpp | 241 | ||||
-rw-r--r-- | tests/auto/qurl/tst_qurl.cpp | 96 |
2 files changed, 256 insertions, 81 deletions
diff --git a/src/corelib/io/qurl.cpp b/src/corelib/io/qurl.cpp index 180e9ec..dc27dfb 100644 --- a/src/corelib/io/qurl.cpp +++ b/src/corelib/io/qurl.cpp @@ -2901,8 +2901,17 @@ static bool isBidirectionalL(const QChar &ch) return false; } +#ifdef QT_BUILD_INTERNAL +// export for tst_qurl.cpp +Q_AUTOTEST_EXPORT QString qt_nameprep(const QString &); +Q_AUTOTEST_EXPORT bool qt_check_std3rules(const QStringRef &); +#else +// non-test build, keep the symbols for ourselves +static QString qt_nameprep(const QString &); +static bool qt_check_std3rules(const QStringRef &); +#endif -Q_AUTOTEST_EXPORT QString qt_nameprep(const QString &source) +QString qt_nameprep(const QString &source) { QString mapped = source; @@ -2959,8 +2968,32 @@ Q_AUTOTEST_EXPORT QString qt_nameprep(const QString &source) return mapped; } +bool qt_check_std3rules(const QStringRef &source) +{ + int len = source.length(); + if (len > 63) + return false; + + for (int i = 0; i < len; ++i) { + register ushort c = source.at(i).unicode(); + if (c == '-' && (i == 0 || i == len - 1)) + return false; -static inline char encodeDigit(uint digit) + // verifying the absence of LDH is the same as verifying that + // only LDH is present + if (c == '-' || (c >= '0' && c <= '9') + || (c >= 'A' && c <= 'Z') + || (c >= 'a' && c <= 'z')) + continue; + + return false; + } + + return true; +} + + +static inline uint encodeDigit(uint digit) { return digit + 22 + 75 * (digit < 26); } @@ -2977,7 +3010,7 @@ static inline uint adapt(uint delta, uint numpoints, bool firsttime) return k + (((base - tmin + 1) * delta) / (delta + skew)); } -static inline void appendEncode(QByteArray* output, uint& delta, uint& bias, uint& b, uint& h) +static inline void appendEncode(QString* output, uint& delta, uint& bias, uint& b, uint& h) { uint qq; uint k; @@ -2991,17 +3024,17 @@ static inline void appendEncode(QByteArray* output, uint& delta, uint& bias, uin t = (k <= bias) ? tmin : (k >= bias + tmax) ? tmax : k - bias; if (qq < t) break; - *output += encodeDigit(t + (qq - t) % (base - t)); + *output += QChar(encodeDigit(t + (qq - t) % (base - t))); qq = (qq - t) / (base - t); } - *output += encodeDigit(qq); + *output += QChar(encodeDigit(qq)); bias = adapt(delta, h + 1, h == b); delta = 0; ++h; } -static void toPunycodeHelper(const QChar *s, int ucLength, QByteArray *output) +static void toPunycodeHelper(const QChar *s, int ucLength, QString *output) { uint n = initial_n; uint delta = 0; @@ -3010,7 +3043,7 @@ static void toPunycodeHelper(const QChar *s, int ucLength, QByteArray *output) int outLen = output->length(); output->resize(outLen + ucLength); - char *d = output->data() + outLen; + QChar *d = output->data() + outLen; bool skipped = false; // copy all basic code points verbatim to output. for (uint j = 0; j < (uint) ucLength; ++j) { @@ -3035,7 +3068,7 @@ static void toPunycodeHelper(const QChar *s, int ucLength, QByteArray *output) // if basic code points were copied, add the delimiter character. if (h > 0) - *output += 0x2d; + *output += QChar(0x2d); // while there are still unprocessed non-basic code points left in // the input string... @@ -3083,7 +3116,7 @@ static void toPunycodeHelper(const QChar *s, int ucLength, QByteArray *output) } // prepend ACE prefix - output->insert(outLen, "xn--"); + output->insert(outLen, QLatin1String("xn--")); return; } @@ -3164,46 +3197,118 @@ static bool qt_is_idn_enabled(const QString &domain) return equal(tld, len, idn_whitelist[i]); } -static QString qt_from_ACE(const QString &domainMC) +static inline bool isDotDelimiter(ushort uc) { + // IDNA / rfc3490 describes these four delimiters used for + // separating labels in unicode international domain + // names. + return uc == 0x2e || uc == 0x3002 || uc == 0xff0e || uc == 0xff61; +} + +static int nextDotDelimiter(const QString &domain, int from = 0) +{ + const QChar *b = domain.unicode(); + const QChar *ch = b + from; + const QChar *e = b + domain.length(); + while (ch < e) { + if (isDotDelimiter(ch->unicode())) + break; + else + ++ch; + } + return ch - b; +} + +enum AceOperation { ToAceOnly, NormalizeAce }; +static QString qt_ACE_do(const QString &domainMC, AceOperation op) +{ + if (domainMC.isEmpty()) + return domainMC; QString domain = domainMC.toLower(); - int idx = domain.indexOf(QLatin1Char('.')); - if (idx != -1) { - if (!domain.contains(QLatin1String("xn--"))) { - bool simple = true; - for (int i = 0; i < domain.size(); ++i) { - ushort ch = domain.at(i).unicode(); - if (ch > 'z' || ch < '-' || ch == '/' || (ch > '9' && ch < 'A') || (ch > 'Z' && ch < 'a')) { - simple = false; - break; - } + + QString result; + result.reserve(domain.length()); + + const bool isIdnEnabled = op == NormalizeAce ? qt_is_idn_enabled(domain) : false; + int lastIdx = 0; + while (1) { + int idx = nextDotDelimiter(domain, lastIdx); + if (idx == lastIdx) + return QString(); // two delimiters in a row -- empty label not allowed + + // RFC 3490 says, about the ToASCII operation: + // 3. If the UseSTD3ASCIIRules flag is set, then perform these checks: + // + // (a) Verify the absence of non-LDH ASCII code points; that is, the + // absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F. + // + // (b) Verify the absence of leading and trailing hyphen-minus; that + // is, the absence of U+002D at the beginning and end of the + // sequence. + // and: + // 8. Verify that the number of code points is in the range 1 to 63 + // inclusive. + + bool simple = true; + for (int i = lastIdx; i < idx; ++i) { + ushort ch = domain.at(i).unicode(); + if (ch > 0x7f) { + simple = false; + break; } - if (simple) - return domain; } - - const bool isIdnEnabled = qt_is_idn_enabled(domain); - int lastIdx = 0; - QString result; - while (1) { + + if (simple && idx > lastIdx + 4) { + // ACE form domains are simple, but we can't consider them simple + // is this an ACE form? + static const ushort acePrefixUtf16[] = { 'x', 'n', '-', '-' }; + if (memcmp(domain.utf16() + lastIdx, acePrefixUtf16, sizeof acePrefixUtf16) == 0) + simple = false; + } + + QString aux; + QStringRef label; + if (simple) { + // fastest conversion: this is the common case (non IDN-domains) + // just memcpy from source (domain) to destination (result) + // there's no need to nameprep since everything is ASCII already + int prevLen = result.size(); + result.resize(prevLen + idx - lastIdx); + memcpy(result.data() + prevLen, domain.constData() + lastIdx, (idx - lastIdx) * sizeof(QChar)); + + label = QStringRef(&result, prevLen, result.length() - prevLen); + } else { // Nameprep the host. If the labels in the hostname are Punycode // encoded, we decode them immediately, then nameprep them. - QByteArray label; - toPunycodeHelper(domain.constData() + lastIdx, idx - lastIdx, &label); - result += qt_nameprep(isIdnEnabled ? QUrl::fromPunycode(label) : QString::fromLatin1(label)); - lastIdx = idx + 1; - if (lastIdx < domain.size() + 1) - result += QLatin1Char('.'); - else - break; - idx = domain.indexOf(QLatin1Char('.'), lastIdx); - if (idx == -1) - idx = domain.size(); + QString tmp = domain.mid(lastIdx, idx - lastIdx); + tmp = qt_nameprep(tmp); + + if (isIdnEnabled) { + toPunycodeHelper(tmp.constData(), tmp.size(), &aux); + label = QStringRef(&aux); + + tmp = QUrl::fromPunycode(aux.toLatin1()); + if (tmp.isEmpty()) + return QString(); + result += tmp; + } else { + int prevLen = result.size(); + toPunycodeHelper(tmp.constData(), tmp.size(), &result); + + label = QStringRef(&result, prevLen, result.length() - prevLen); + } } - return result; - } else { - return qt_nameprep(domain); + + if (!qt_check_std3rules(label)) + return QString(); + + lastIdx = idx + 1; + if (lastIdx < domain.size() + 1) + result += QLatin1Char('.'); + else + break; } + return result; } @@ -3246,12 +3351,27 @@ QUrlPrivate::QUrlPrivate(const QUrlPrivate ©) QString QUrlPrivate::canonicalHost() const { - if (QURL_HASFLAG(stateFlags, HostCanonicalized)) + if (QURL_HASFLAG(stateFlags, HostCanonicalized) || host.isEmpty()) return host; QUrlPrivate *that = const_cast<QUrlPrivate *>(this); QURL_SETFLAG(that->stateFlags, HostCanonicalized); - that->host = qt_from_ACE(host); + if (host.contains(QLatin1Char(':'))) { + // This is an IP Literal, use _IPLiteral to validate + QByteArray ba = host.toLatin1(); + if (!ba.startsWith('[')) { + // surround the IP Literal with [ ] if it's not already done so + ba.reserve(ba.length() + 2); + ba.prepend('['); + ba.append(']'); + } + + const char *ptr = ba.constData(); + if (!_IPLiteral(&ptr)) + that->host.clear(); + } else { + that->host = qt_ACE_do(host, NormalizeAce); + } return that->host; } @@ -3737,7 +3857,10 @@ QByteArray QUrlPrivate::toEncoded(QUrl::FormattingOptions options) const } } - url += QUrl::toAce(host); + if (host.startsWith(QLatin1Char('['))) + url += host.toLatin1(); + else + url += QUrl::toAce(host); if (!(options & QUrl::RemovePort) && port != -1) { url += ':'; url += QString::number(port).toAscii(); @@ -4412,8 +4535,6 @@ void QUrl::setHost(const QString &host) QURL_UNSETFLAG(d->stateFlags, QUrlPrivate::Validated | QUrlPrivate::Normalized | QUrlPrivate::HostCanonicalized); d->host = host; - if (d->host.contains(QLatin1Char(':'))) - d->host = QLatin1Char('[') + d->host + QLatin1Char(']'); } /*! @@ -5425,9 +5546,9 @@ QByteArray QUrl::toPercentEncoding(const QString &input, const QByteArray &exclu */ QByteArray QUrl::toPunycode(const QString &uc) { - QByteArray output; + QString output; toPunycodeHelper(uc.constData(), uc.size(), &output); - return output; + return output.toLatin1(); } /*! @@ -5528,7 +5649,7 @@ QString QUrl::fromPunycode(const QByteArray &pc) */ QString QUrl::fromAce(const QByteArray &domain) { - return qt_from_ACE(QString::fromLatin1(domain)); + return qt_ACE_do(QString::fromLatin1(domain), NormalizeAce); } /*! @@ -5545,26 +5666,8 @@ QString QUrl::fromAce(const QByteArray &domain) */ QByteArray QUrl::toAce(const QString &domain) { - // IDNA / rfc3490 describes these four delimiters used for - // separating labels in unicode international domain - // names. - QString nameprepped = qt_nameprep(domain); - int lastIdx = 0; - QByteArray result; - for (int i = 0; i < nameprepped.size(); ++i) { - ushort uc = nameprepped.at(i).unicode(); - if (uc == 0x2e || uc == 0x3002 || uc == 0xff0e || uc == 0xff61) { - if (lastIdx) - result += '.'; - toPunycodeHelper(nameprepped.constData() + lastIdx, i - lastIdx, &result); - lastIdx = i + 1; - } - } - if (lastIdx) - result += '.'; - toPunycodeHelper(nameprepped.constData() + lastIdx, nameprepped.size() - lastIdx, &result); - - return result; + QString result = qt_ACE_do(domain, ToAceOnly); + return result.toLatin1(); } /*! diff --git a/tests/auto/qurl/tst_qurl.cpp b/tests/auto/qurl/tst_qurl.cpp index 723f882..fcced37 100644 --- a/tests/auto/qurl/tst_qurl.cpp +++ b/tests/auto/qurl/tst_qurl.cpp @@ -162,6 +162,8 @@ private slots: void nameprep_testsuite(); void ace_testsuite_data(); void ace_testsuite(); + void std3violations_data(); + void std3violations(); void tldRestrictions_data(); void tldRestrictions(); void emptyQueryOrFragment(); @@ -3100,6 +3102,11 @@ void tst_QUrl::ace_testsuite_data() QTest::newRow("ascii-upper") << "FLUKE" << "fluke" << "fluke" << "fluke"; QTest::newRow("asciifolded") << QString::fromLatin1("stra\337e") << "strasse" << "." << "strasse"; + QTest::newRow("asciifolded-dotcom") << QString::fromLatin1("stra\337e.example.com") << "strasse.example.com" << "." << "strasse.example.com"; + QTest::newRow("greek-mu") << QString::fromLatin1("\265V") + <<"xn--v-lmb" + << "." + << QString::fromUtf8("\316\274v"); QTest::newRow("non-ascii-lower") << QString::fromLatin1("alqualond\353") << "xn--alqualond-34a" @@ -3132,6 +3139,9 @@ void tst_QUrl::ace_testsuite_data() QTest::newRow("idn-upper") << "XN--ALQUALOND-34A" << "xn--alqualond-34a" << QString::fromLatin1("alqualond\353") << QString::fromLatin1("alqualond\353"); + + QTest::newRow("separator-3002") << QString::fromUtf8("example\343\200\202com") + << "example.com" << "." << "example.com"; } void tst_QUrl::ace_testsuite() @@ -3142,23 +3152,85 @@ void tst_QUrl::ace_testsuite() QFETCH(QString, fromace); QFETCH(QString, unicode); - QString domain = in + canonsuffix; - QCOMPARE(QString::fromLatin1(QUrl::toAce(domain)), toace + canonsuffix); + const char *suffix = canonsuffix; + if (toace.contains('.')) + suffix = 0; + + QString domain = in + suffix; + QCOMPARE(QString::fromLatin1(QUrl::toAce(domain)), toace + suffix); if (fromace != ".") - QCOMPARE(QUrl::fromAce(domain.toLatin1()), fromace + canonsuffix); - QCOMPARE(QUrl::fromAce(QUrl::toAce(domain)), unicode + canonsuffix); + QCOMPARE(QUrl::fromAce(domain.toLatin1()), fromace + suffix); + QCOMPARE(QUrl::fromAce(QUrl::toAce(domain)), unicode + suffix); - domain = in + ".troll.No"; - QCOMPARE(QString::fromLatin1(QUrl::toAce(domain)), toace + canonsuffix); + domain = in + (suffix ? ".troll.No" : ""); + QCOMPARE(QString::fromLatin1(QUrl::toAce(domain)), toace + suffix); if (fromace != ".") - QCOMPARE(QUrl::fromAce(domain.toLatin1()), fromace + canonsuffix); - QCOMPARE(QUrl::fromAce(QUrl::toAce(domain)), unicode + canonsuffix); + QCOMPARE(QUrl::fromAce(domain.toLatin1()), fromace + suffix); + QCOMPARE(QUrl::fromAce(QUrl::toAce(domain)), unicode + suffix); - domain = in + ".troll.NO"; - QCOMPARE(QString::fromLatin1(QUrl::toAce(domain)), toace + canonsuffix); + domain = in + (suffix ? ".troll.NO" : ""); + QCOMPARE(QString::fromLatin1(QUrl::toAce(domain)), toace + suffix); if (fromace != ".") - QCOMPARE(QUrl::fromAce(domain.toLatin1()), fromace + canonsuffix); - QCOMPARE(QUrl::fromAce(QUrl::toAce(domain)), unicode + canonsuffix); + QCOMPARE(QUrl::fromAce(domain.toLatin1()), fromace + suffix); + QCOMPARE(QUrl::fromAce(QUrl::toAce(domain)), unicode + suffix); +} + +void tst_QUrl::std3violations_data() +{ + QTest::addColumn<QString>("source"); + QTest::addColumn<bool>("validUrl"); + + QTest::newRow("too-long") << "this-domain-is-far-too-long-for-its-own-good-and-should-have-been-limited-to-63-chars" << false; + QTest::newRow("dash-begin") << "-x-foo" << false; + QTest::newRow("dash-end") << "x-foo-" << false; + QTest::newRow("dash-begin-end") << "-foo-" << false; + + QTest::newRow("control") << "\033foo" << false; + QTest::newRow("bang") << "foo!" << false; + QTest::newRow("plus") << "foo+bar" << false; + QTest::newRow("dot") << "foo.bar"; + QTest::newRow("slash") << "foo/bar" << true; + QTest::newRow("colon") << "foo:80" << true; + QTest::newRow("question") << "foo?bar" << true; + QTest::newRow("at") << "foo@bar" << true; + QTest::newRow("backslash") << "foo\\bar" << false; + QTest::newRow("underline") << "foo_bar" << false; + + // these characters are transformed by NFKC to non-LDH characters + QTest::newRow("dot-like") << QString::fromUtf8("foo\342\200\244bar") << false; // U+2024 ONE DOT LEADER + QTest::newRow("slash-like") << QString::fromUtf8("foo\357\274\217bar") << false; // U+FF0F FULLWIDTH SOLIDUS + + // The following should be invalid but isn't + // the DIVISON SLASH doesn't case-fold to a slash + // is this a problem with RFC 3490? + //QTest::newRow("slash-like2") << QString::fromUtf8("foo\342\210\225bar") << false; // U+2215 DIVISION SLASH +} + +void tst_QUrl::std3violations() +{ + QFETCH(QString, source); + + extern QString qt_nameprep(const QString &); + extern bool qt_check_std3rules(const QStringRef &); + + { + QString prepped = qt_nameprep(source); + QVERIFY(!qt_check_std3rules(QStringRef(&prepped))); + } + + if (source.contains('.')) + return; // this test ends here + + QUrl url; + url.setHost(source); + QVERIFY(url.host().isEmpty()); + + QFETCH(bool, validUrl); + if (validUrl) + return; // test ends here for these cases + + url = QUrl("http://" + source + "/some/path"); + QVERIFY(!url.isValid()); } void tst_QUrl::tldRestrictions_data() |