From a97862318a6b3f72a05a3e66ebcf5f0e455bb42a Mon Sep 17 00:00:00 2001 From: Lars Knoll Date: Fri, 11 Apr 2008 13:12:02 +0200 Subject: Don't use memcmp it's terribly slow. Added qMemEquals method that returns true if the two memory regions contain the same content. Reviewed-By: Thiago Macieira --- src/corelib/tools/qstring.cpp | 152 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 146 insertions(+), 6 deletions(-) diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index c3649e3..81b55b7 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -195,6 +195,142 @@ static int ucstrnicmp(const ushort *a, const ushort *b, int l) return ucstricmp(a, a + l, b, b + l); } +#if 0 +static bool qMemEquals(const quint8 *a, const quint8 *b, int bytes) +{ + if (a == b) + return true; + + // check alignement + qptrdiff align = ((quintptr)a - (quintptr)b); +// if (sizeof(void *) == 8 && !(align & 7)) { +// } else + if (!(align & 3)) { + quintptr pa = (quintptr)a; + if (pa & 3) { + if (pa & 1) { + if (*a != *b) + return false; + ++a; + ++b; + --bytes; + } + if (pa & 2) { + if (*reinterpret_cast(a) != *reinterpret_cast(b)) + return false; + a += 2; + b += 2; + bytes -= 2; + } + } + int tail = (bytes & 3); + const quint32 *sa = reinterpret_cast(a); + const quint32 *sb = reinterpret_cast(b); + const quint32 *e = sa + (bytes >> 2); + while (sa < e) { + if (*sa != *sb) + return false; + ++sa; + ++sb; + } + a = reinterpret_cast(sa); + b = reinterpret_cast(sb); + if (tail) { + if (tail & 2) { + if (*reinterpret_cast(a) != *reinterpret_cast(b)) + return false; + sa += 2; + sb += 2; + } + if (tail & 1) { + if (*a != *b) + return false; + } + } + } else if (!(align & 1)) { + quintptr pa = (quintptr)a; + if (pa & 1) { + if (*a != *b) + return false; + ++a; + ++b; + --bytes; + } + bool tail = (bytes & 1); + const quint16 *sa = reinterpret_cast(a); + const quint16 *sb = reinterpret_cast(b); + const quint16 *e = sa + (bytes >> 1); + while (sa < e) { + if (*sa != *sb) + return false; + ++sa; + ++sb; + } + a = reinterpret_cast(sa); + b = reinterpret_cast(sb); + if (tail) { + if (*a != *b) + return false; + } + } else { + const quint8 *e = a + bytes; + while (a < e) { + if (*a != *b) + return false; + ++a; + ++b; + } + } + return true; +} +#endif + +static bool qMemEquals(const quint16 *a, const quint16 *b, int length) +{ + if (a == b) + return true; + + // check alignement + qptrdiff align = ((quintptr)a - (quintptr)b); + Q_ASSERT(!(align & 1)); +// if (sizeof(void *) == 8 && !(align & 7)) { +// } else + if (!(align & 3)) { + quintptr pa = (quintptr)a; + if (pa & 2) { + if (*a != *b) + return false; + ++a; + ++b; + --length; + } + int tail = (length & 1); + const quint32 *sa = reinterpret_cast(a); + const quint32 *sb = reinterpret_cast(b); + const quint32 *e = sa + (length >> 1); + while (sa < e) { + if (*sa != *sb) + return false; + ++sa; + ++sb; + } + a = reinterpret_cast(sa); + b = reinterpret_cast(sb); + if (tail) { + if (*a != *b) + return false; + } + } else if (!(align & 1)) { + const quint16 *e = a + length; + while (a < e) { + if (*a != *b) + return false; + ++a; + ++b; + } + } + return true; +} /*! \internal @@ -1908,8 +2044,10 @@ QString &QString::replace(QChar c, const QLatin1String &after, Qt::CaseSensitivi */ bool QString::operator==(const QString &other) const { - return (size() == other.size()) && - (memcmp((char*)unicode(),(char*)other.unicode(), size()*sizeof(QChar))==0); + if (d->size != other.d->size) + return false; + + return qMemEquals(d->data, other.d->data, d->size); } /*! @@ -3136,7 +3274,7 @@ bool QString::startsWith(const QString& s, Qt::CaseSensitivity cs) const if (s.d->size > d->size) return false; if (cs == Qt::CaseSensitive) { - return memcmp((char*)d->data, (char*)s.d->data, s.d->size*sizeof(QChar)) == 0; + return qMemEquals(d->data, s.d->data, s.d->size); } else { uint last = 0; uint olast = 0; @@ -3207,7 +3345,7 @@ bool QString::endsWith(const QString& s, Qt::CaseSensitivity cs) const if (pos < 0) return false; if (cs == Qt::CaseSensitive) { - return memcmp((char*)&d->data[pos], (char*)s.d->data, s.d->size*sizeof(QChar)) == 0; + return qMemEquals(d->data + pos, s.d->data, s.d->size); } else { uint last = 0; uint olast = 0; @@ -7692,7 +7830,8 @@ QString QStringRef::toString() const { */ bool operator==(const QStringRef &s1,const QStringRef &s2) { return (s1.size() == s2.size() && - (memcmp((char*)s1.unicode(), (char*)s2.unicode(), s1.size()*sizeof(QChar))==0)); } + qMemEquals((const ushort *)s1.unicode(), (const ushort *)s2.unicode(), s1.size())); +} /*! \relates QStringRef @@ -7701,7 +7840,8 @@ bool operator==(const QStringRef &s1,const QStringRef &s2) */ bool operator==(const QString &s1,const QStringRef &s2) { return (s1.size() == s2.size() && - (memcmp((char*)s1.unicode(), (char*)s2.unicode(), s1.size()*sizeof(QChar))==0)); } + qMemEquals((const ushort *)s1.unicode(), (const ushort *)s2.unicode(), s1.size())); +} /*! \relates QStringRef -- cgit v0.12 From 7c848d093e2ce12528d8af435a6e10bc0e28f1e3 Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Mon, 18 May 2009 23:24:03 +0200 Subject: Add a benchmark for QString::operator== Reviewed-By: Bradley T. Hughes --- tests/benchmarks/qstring/main.cpp | 104 +++++++++++++++++++++++++++++++++++ tests/benchmarks/qstring/qstring.pro | 4 ++ 2 files changed, 108 insertions(+) create mode 100644 tests/benchmarks/qstring/main.cpp create mode 100644 tests/benchmarks/qstring/qstring.pro diff --git a/tests/benchmarks/qstring/main.cpp b/tests/benchmarks/qstring/main.cpp new file mode 100644 index 0000000..c7962bd --- /dev/null +++ b/tests/benchmarks/qstring/main.cpp @@ -0,0 +1,104 @@ +/**************************************************************************** +** +** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). +** Contact: Qt Software Information (qt-info@nokia.com) +** +** This file is part of the test suite of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the either Technology Preview License Agreement or the +** Beta Release License Agreement. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain +** additional rights. These rights are described in the Nokia Qt LGPL +** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this +** package. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 3.0 as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU General Public License version 3.0 requirements will be +** met: http://www.gnu.org/copyleft/gpl.html. +** +** If you are unsure which license is appropriate for your use, please +** contact the sales department at qt-sales@nokia.com. +** $QT_END_LICENSE$ +** +****************************************************************************/ +#include +#include + +class tst_QString: public QObject +{ + Q_OBJECT +private slots: + void equals() const; + void equals_data() const; +}; + +void tst_QString::equals() const +{ + QFETCH(QString, a); + QFETCH(QString, b); + + QBENCHMARK { + a == b; + } +} + +void tst_QString::equals_data() const +{ + static const struct { + ushort data[80]; + int dummy; // just to ensure 4-byte alignment + } data = { + { + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, // 16 + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, // 32 + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, // 48 + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, // 64 + 64, 64, 64, 64, 64, 64, 64, 64, + 96, 96, 96, 96, 96, 96, 96, 96 // 80 + }, 0 + }; + const QChar *ptr = reinterpret_cast(data.data); + + QTest::addColumn("a"); + QTest::addColumn("b"); + QString base = QString::fromRawData(ptr, 64); + + QTest::newRow("different-length") << base << QString::fromRawData(ptr, 4); + QTest::newRow("same-string") << base << base; + QTest::newRow("same-data") << base << QString::fromRawData(ptr, 64); + + // don't use length > 64, since that crosses a cache line + QTest::newRow("aligned-odd") + << QString::fromRawData(ptr, 63) << QString::fromRawData(ptr + 2, 63); + QTest::newRow("aligned-even") + << QString::fromRawData(ptr, 64) << QString::fromRawData(ptr + 2, 64); + QTest::newRow("unaligned-even") + << QString::fromRawData(ptr, 63) << QString::fromRawData(ptr + 1, 63); + QTest::newRow("unaligned-odd") + << QString::fromRawData(ptr, 64) << QString::fromRawData(ptr + 1, 64); +} + +QTEST_MAIN(tst_QString) + +#include "main.moc" diff --git a/tests/benchmarks/qstring/qstring.pro b/tests/benchmarks/qstring/qstring.pro new file mode 100644 index 0000000..74423e7 --- /dev/null +++ b/tests/benchmarks/qstring/qstring.pro @@ -0,0 +1,4 @@ +load(qttest_p4) +TARGET = tst_qstring +QT -= gui +SOURCES += main.cpp -- cgit v0.12 From 2387c77787ec279981d58c46cfb5bde3e1530790 Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Mon, 18 May 2009 23:24:26 +0200 Subject: Optimise QString comparison based on the results from the benchmark Reviewed-By: Bradley T. Hughes --- src/corelib/tools/qstring.cpp | 163 ++++++++++-------------------------------- 1 file changed, 38 insertions(+), 125 deletions(-) diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index 81b55b7..456fdfd 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -195,139 +195,52 @@ static int ucstrnicmp(const ushort *a, const ushort *b, int l) return ucstricmp(a, a + l, b, b + l); } -#if 0 -static bool qMemEquals(const quint8 *a, const quint8 *b, int bytes) +static bool qMemEquals(const quint16 *a, const quint16 *b, int length) { + // Benchmarking indicates that doing memcmp is much slower than + // executing the comparison ourselves. + // To make it even faster, we do a 32-bit comparison, comparing + // twice the amount of data as a normal word-by-word comparison. + // + // Benchmarking results on a 2.33 GHz Core2 Duo, with a 64-QChar + // block of data, with 4194304 iterations (per iteration): + // operation usec cpu ticks + // memcmp 330 710 + // 16-bit 135 285-290 + // 32-bit aligned 69.7 135-145 + // + // Testing also indicates that unaligned 32-bit loads are as + // performant as 32-bit aligned. if (a == b) return true; - // check alignement - qptrdiff align = ((quintptr)a - (quintptr)b); -// if (sizeof(void *) == 8 && !(align & 7)) { -// } else - if (!(align & 3)) { - quintptr pa = (quintptr)a; - if (pa & 3) { - if (pa & 1) { - if (*a != *b) - return false; - ++a; - ++b; - --bytes; - } - if (pa & 2) { - if (*reinterpret_cast(a) != *reinterpret_cast(b)) - return false; - a += 2; - b += 2; - bytes -= 2; - } - } - int tail = (bytes & 3); - const quint32 *sa = reinterpret_cast(a); - const quint32 *sb = reinterpret_cast(b); - const quint32 *e = sa + (bytes >> 2); - while (sa < e) { - if (*sa != *sb) - return false; - ++sa; - ++sb; - } - a = reinterpret_cast(sa); - b = reinterpret_cast(sb); - if (tail) { - if (tail & 2) { - if (*reinterpret_cast(a) != *reinterpret_cast(b)) - return false; - sa += 2; - sb += 2; - } - if (tail & 1) { - if (*a != *b) - return false; - } - } - } else if (!(align & 1)) { - quintptr pa = (quintptr)a; - if (pa & 1) { - if (*a != *b) - return false; - ++a; - ++b; - --bytes; - } - bool tail = (bytes & 1); - const quint16 *sa = reinterpret_cast(a); - const quint16 *sb = reinterpret_cast(b); - const quint16 *e = sa + (bytes >> 1); - while (sa < e) { - if (*sa != *sb) - return false; - ++sa; - ++sb; - } - a = reinterpret_cast(sa); - b = reinterpret_cast(sb); - if (tail) { - if (*a != *b) - return false; - } - } else { - const quint8 *e = a + bytes; - while (a < e) { - if (*a != *b) + register union { + const quint16 *w; + const quint32 *d; + quintptr value; + } sa, sb; + sa.w = a; + sb.w = b; + + // check alignment + bool unaligned = (sa.value | sb.value) & 2; +#if defined(__i386__) || defined(__x86_64__) || defined(_M_X64_) + unaligned = false; +#endif + if (!unaligned) { + // both addresses are 4-bytes aligned (or this is an x86) + // do a fast 32-bit comparison + for (register int halfLength = length / 2; halfLength; --halfLength, ++sa.d, ++sb.d) { + if (*sa.d != *sb.d) return false; - ++a; - ++b; } + return length & 1 ? (*sa.w == *sb.w) : true; } - return true; -} -#endif -static bool qMemEquals(const quint16 *a, const quint16 *b, int length) -{ - if (a == b) - return true; - - // check alignement - qptrdiff align = ((quintptr)a - (quintptr)b); - Q_ASSERT(!(align & 1)); -// if (sizeof(void *) == 8 && !(align & 7)) { -// } else - if (!(align & 3)) { - quintptr pa = (quintptr)a; - if (pa & 2) { - if (*a != *b) - return false; - ++a; - ++b; - --length; - } - int tail = (length & 1); - const quint32 *sa = reinterpret_cast(a); - const quint32 *sb = reinterpret_cast(b); - const quint32 *e = sa + (length >> 1); - while (sa < e) { - if (*sa != *sb) - return false; - ++sa; - ++sb; - } - a = reinterpret_cast(sa); - b = reinterpret_cast(sb); - if (tail) { - if (*a != *b) - return false; - } - } else if (!(align & 1)) { - const quint16 *e = a + length; - while (a < e) { - if (*a != *b) - return false; - ++a; - ++b; - } + // one or both of the addresses isn't 2-byte aligned + for ( ; length; --length, ++sa.w, ++sb.w) { + if (*sa.w != *sb.w) + return false; } return true; } -- cgit v0.12 From 1ff30b2b88af31cf18d2a1e6961c3ed7bd9b0240 Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Tue, 19 May 2009 14:38:49 +0200 Subject: Reintroduce the unaligned-unaligned 32-bit code that I had removed out of ignorance. If both pointers are out of 4-byte alignment, doing the first load will align them so we can do 32-bit comparisons. Lars's code had this before, but I misunderstood it and removed, thinking it was doing misaligned accesses. I experimented with moving the tail comparison above the 32-bit comparison to save a register, but it made things worse. Reviewed-By: Bradley T. Hughes --- src/corelib/tools/qstring.cpp | 45 +++++++++++++++++++++++++-------------- tests/benchmarks/qstring/main.cpp | 41 +++++++++++++++++++++++++---------- 2 files changed, 59 insertions(+), 27 deletions(-) diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index 456fdfd..29509c5 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -206,12 +206,12 @@ static bool qMemEquals(const quint16 *a, const quint16 *b, int length) // block of data, with 4194304 iterations (per iteration): // operation usec cpu ticks // memcmp 330 710 - // 16-bit 135 285-290 - // 32-bit aligned 69.7 135-145 + // 16-bit 79 167-171 + // 32-bit aligned 49 105-109 // // Testing also indicates that unaligned 32-bit loads are as // performant as 32-bit aligned. - if (a == b) + if (a == b || !length) return true; register union { @@ -223,24 +223,37 @@ static bool qMemEquals(const quint16 *a, const quint16 *b, int length) sb.w = b; // check alignment - bool unaligned = (sa.value | sb.value) & 2; -#if defined(__i386__) || defined(__x86_64__) || defined(_M_X64_) - unaligned = false; -#endif - if (!unaligned) { - // both addresses are 4-bytes aligned (or this is an x86) + if ((sa.value & 2) == (sb.value & 2)) { + // both addresses have the same alignment + if (sa.value & 2) { + // both addresses are not aligned to 4-bytes boundaries + // compare the first character + if (*sa.w != *sb.w) + return false; + --length; + ++sa.w; + ++sb.w; + + // now both addresses are 4-bytes aligned + } + + // both addresses are 4-bytes aligned // do a fast 32-bit comparison - for (register int halfLength = length / 2; halfLength; --halfLength, ++sa.d, ++sb.d) { + register const quint32 *e = sa.d + (length >> 1); + for ( ; sa.d != e; ++sa.d, ++sb.d) { if (*sa.d != *sb.d) return false; } - return length & 1 ? (*sa.w == *sb.w) : true; - } - // one or both of the addresses isn't 2-byte aligned - for ( ; length; --length, ++sa.w, ++sb.w) { - if (*sa.w != *sb.w) - return false; + // do we have a tail? + return (length & 1) ? *sa.w == *sb.w : true; + } else { + // one of the addresses isn't 4-byte aligned but the other is + register const quint16 *e = sa.w + length; + for ( ; sa.w != e; ++sa.w, ++sb.w) { + if (*sa.w != *sb.w) + return false; + } } return true; } diff --git a/tests/benchmarks/qstring/main.cpp b/tests/benchmarks/qstring/main.cpp index c7962bd..cbbf0a1 100644 --- a/tests/benchmarks/qstring/main.cpp +++ b/tests/benchmarks/qstring/main.cpp @@ -74,8 +74,8 @@ void tst_QString::equals_data() const 64, 64, 64, 64, 64, 64, 64, 64, // 48 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 64 - 64, 64, 64, 64, 64, 64, 64, 64, - 96, 96, 96, 96, 96, 96, 96, 96 // 80 + 64, 64, 64, 64, 96, 96, 96, 96, + 64, 64, 96, 96, 96, 96, 96, 96 // 80 }, 0 }; const QChar *ptr = reinterpret_cast(data.data); @@ -88,15 +88,34 @@ void tst_QString::equals_data() const QTest::newRow("same-string") << base << base; QTest::newRow("same-data") << base << QString::fromRawData(ptr, 64); - // don't use length > 64, since that crosses a cache line - QTest::newRow("aligned-odd") - << QString::fromRawData(ptr, 63) << QString::fromRawData(ptr + 2, 63); - QTest::newRow("aligned-even") - << QString::fromRawData(ptr, 64) << QString::fromRawData(ptr + 2, 64); - QTest::newRow("unaligned-even") - << QString::fromRawData(ptr, 63) << QString::fromRawData(ptr + 1, 63); - QTest::newRow("unaligned-odd") - << QString::fromRawData(ptr, 64) << QString::fromRawData(ptr + 1, 64); + // try to avoid crossing a cache line (that is, at ptr[64]) + QTest::newRow("aligned-aligned-4n") + << QString::fromRawData(ptr, 60) << QString::fromRawData(ptr + 2, 60); + QTest::newRow("aligned-unaligned-4n") + << QString::fromRawData(ptr, 60) << QString::fromRawData(ptr + 1, 60); + QTest::newRow("unaligned-unaligned-4n") + << QString::fromRawData(ptr + 1, 60) << QString::fromRawData(ptr + 3, 60); + + QTest::newRow("aligned-aligned-4n+1") + << QString::fromRawData(ptr, 61) << QString::fromRawData(ptr + 2, 61); + QTest::newRow("aligned-unaligned-4n+1") + << QString::fromRawData(ptr, 61) << QString::fromRawData(ptr + 1, 61); + QTest::newRow("unaligned-unaligned-4n+1") + << QString::fromRawData(ptr + 1, 61) << QString::fromRawData(ptr + 3, 61); + + QTest::newRow("aligned-aligned-4n-1") + << QString::fromRawData(ptr, 59) << QString::fromRawData(ptr + 2, 59); + QTest::newRow("aligned-unaligned-4n-1") + << QString::fromRawData(ptr, 59) << QString::fromRawData(ptr + 1, 59); + QTest::newRow("unaligned-unaligned-4n-1") + << QString::fromRawData(ptr + 1, 59) << QString::fromRawData(ptr + 3, 59); + + QTest::newRow("aligned-aligned-2n") + << QString::fromRawData(ptr, 58) << QString::fromRawData(ptr + 2, 58); + QTest::newRow("aligned-unaligned-2n") + << QString::fromRawData(ptr, 58) << QString::fromRawData(ptr + 1, 58); + QTest::newRow("unaligned-unaligned-2n") + << QString::fromRawData(ptr + 1, 58) << QString::fromRawData(ptr + 3, 58); } QTEST_MAIN(tst_QString) -- cgit v0.12