summaryrefslogtreecommitdiffstats
path: root/src/corelib/tools/qstring.cpp
diff options
context:
space:
mode:
authorQt Continuous Integration System <qt-info@nokia.com>2010-08-25 01:42:44 (GMT)
committerQt Continuous Integration System <qt-info@nokia.com>2010-08-25 01:42:44 (GMT)
commite141cb25e57b40af839abf872f65895724a39e88 (patch)
treeda07d2abc2e5b58dd9d1dadaf5f819b28dcf7231 /src/corelib/tools/qstring.cpp
parent675188b766b7b0f7eb65c2e9f3ee7d6017e63453 (diff)
parented1fecfe6b7ce370184ef4dfb421fb387807633b (diff)
downloadQt-e141cb25e57b40af839abf872f65895724a39e88.zip
Qt-e141cb25e57b40af839abf872f65895724a39e88.tar.gz
Qt-e141cb25e57b40af839abf872f65895724a39e88.tar.bz2
Merge branch '4.7' of scm.dev.nokia.troll.no:qt/oslo-staging-1 into 4.7-integration
* '4.7' of scm.dev.nokia.troll.no:qt/oslo-staging-1: (50 commits) Add the missing license headers to the QString benchmark data Fix building of qsimd.cpp on Windows CE Use QElapsedTimer for the benchlib tests. Properly implement the CPU feature disabling in qsimd.cpp. Report the detected CPU features in the corelib boilerplate Detect CPU features on ARM by reading the ELF auxvec. Split the CPU-detection code into multiple functions for readability Fixed delivering gestures to a toplevel widget. Unroll the SSSE3 code even more to avoid the need to keep an extra variable for inverting the result Don't try to compile the SSE2 and SSSE3 code with compilers that don't support them (e.g. ARM) Improve on the SSSE3 with alternate aligning function. Add the beginnings of a new SSSE3-based aligning algorithm Small fixup Update the SSSE3-with-alignment function to use aligned loads. Add an ucstrncmp that uses SSSE3 with aligning. Add an SSSE3 version of ucstrncmp Optimise the tail comparison of ucstrncmp Add a version of ucstrncmp with SSE2 with aligning. Add an SSE2-optimised version of ucstrncmp Add the ucstrncmp benchmarks ...
Diffstat (limited to 'src/corelib/tools/qstring.cpp')
-rw-r--r--src/corelib/tools/qstring.cpp63
1 files changed, 48 insertions, 15 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index 2fd9a0b..d940bf8 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -173,7 +173,7 @@ static int ucstricmp(const ushort *a, const ushort *ae, const uchar *b)
return 1;
}
-// Unicode case-insensitive comparison
+// Unicode case-sensitive comparison
static int ucstrcmp(const QChar *a, int alen, const QChar *b, int blen)
{
if (a == b && alen == blen)
@@ -202,22 +202,55 @@ static int ucstrnicmp(const ushort *a, const ushort *b, int l)
return ucstricmp(a, a + l, b, b + l);
}
+// Benchmarking indicates that doing memcmp is much slower than
+// executing the comparison ourselves.
+//
+// The profiling was done on a population of calls to qMemEquals, generated
+// during a run of the demo browser. The profile of the data (32-bit x86
+// Linux) was:
+//
+// total number of comparisons: 21353
+// longest string compared: 95
+// average comparison length: 14.8786
+// cache-line crosses: 5661 (13.3%)
+// alignment histogram:
+// 0xXXX0 = 512 (1.2%) strings, 0 (0.0%) of which same-aligned
+// 0xXXX2 = 15087 (35.3%) strings, 5145 (34.1%) of which same-aligned
+// 0xXXX4 = 525 (1.2%) strings, 0 (0.0%) of which same-aligned
+// 0xXXX6 = 557 (1.3%) strings, 6 (1.1%) of which same-aligned
+// 0xXXX8 = 509 (1.2%) strings, 0 (0.0%) of which same-aligned
+// 0xXXXa = 24358 (57.0%) strings, 9901 (40.6%) of which same-aligned
+// 0xXXXc = 557 (1.3%) strings, 0 (0.0%) of which same-aligned
+// 0xXXXe = 601 (1.4%) strings, 15 (2.5%) of which same-aligned
+// total = 42706 (100%) strings, 15067 (35.3%) of which same-aligned
+//
+// 92% of the strings have alignment of 2 or 10, which is due to malloc on
+// 32-bit Linux returning values aligned to 8 bytes, and offsetof(array, QString::Data) == 18.
+//
+// The profile on 64-bit will be different since offsetof(array, QString::Data) == 26.
+//
+// The benchmark results were, for a Core-i7 @ 2.67 GHz 32-bit, compiled with -O3 -funroll-loops:
+// 16-bit loads only: 872,301 CPU ticks [Qt 4.5 / memcmp]
+// 32- and 16-bit loads: 773,362 CPU ticks [Qt 4.6]
+// SSE2 "movdqu" 128-bit loads: 618,736 CPU ticks
+// SSE3 "lddqu" 128-bit loads: 619,954 CPU ticks
+// SSSE3 "palignr" corrections: 852,147 CPU ticks
+// SSE4.2 "pcmpestrm": 738,702 CPU ticks
+//
+// The same benchmark on an Atom N450 @ 1.66 GHz, is:
+// 16-bit loads only: 2,185,882 CPU ticks
+// 32- and 16-bit loads: 1,805,060 CPU ticks
+// SSE2 "movdqu" 128-bit loads: 2,529,843 CPU ticks
+// SSE3 "lddqu" 128-bit loads: 2,514,858 CPU ticks
+// SSSE3 "palignr" corrections: 2,160,325 CPU ticks
+// SSE4.2 not available
+//
+// The conclusion we reach is that alignment the SSE2 unaligned code can gain
+// 20% improvement in performance in some systems, but suffers a penalty due
+// to the unaligned loads on others.
+
static bool qMemEquals(const quint16 *a, const quint16 *b, int length)
{
- // Benchmarking indicates that doing memcmp is much slower than
- // executing the comparison ourselves.
- // To make it even faster, we do a 32-bit comparison, comparing
- // twice the amount of data as a normal word-by-word comparison.
- //
- // Benchmarking results on a 2.33 GHz Core2 Duo, with a 64-QChar
- // block of data, with 4194304 iterations (per iteration):
- // operation usec cpu ticks
- // memcmp 330 710
- // 16-bit 79 167-171
- // 32-bit aligned 49 105-109
- //
- // Testing also indicates that unaligned 32-bit loads are as
- // performant as 32-bit aligned.
if (a == b || !length)
return true;