Update comments in QString about alignment performance.

This commit makes no code change at all. Write down the conclusions from experimenting with the x86 SIMD instructions for faster string comparison routines. Long story short: we're already pretty good. The SSE4.2 string instructions are probably more useful for "implicit-length mode" (that is, the end of the string is marked by a NUL), rather than QString's "explicit-length mode".
author: Thiago Macieira <thiago.macieira@nokia.com> 2010-08-17 18:34:54 (GMT)
committer: Thiago Macieira <thiago.macieira@nokia.com> 2010-08-24 10:36:36 (GMT)
commit: 8063e04f3bf6e46ec972b6cafea39454a47e16de (patch)
tree: 113aefcfbd59adc7bbc84efe121a235108b0b675 /src/corelib
parent: c58293407519e7413cb6493f831ef69cf2323030 (diff)
download: Qt-8063e04f3bf6e46ec972b6cafea39454a47e16de.zip
Qt-8063e04f3bf6e46ec972b6cafea39454a47e16de.tar.gz
Qt-8063e04f3bf6e46ec972b6cafea39454a47e16de.tar.bz2
2 files changed, 49 insertions, 15 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index 2fd9a0b..d940bf8 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -173,7 +173,7 @@ static int ucstricmp(const ushort *a, const ushort *ae, const uchar *b)
     return 1;
 }
 
-// Unicode case-insensitive comparison
+// Unicode case-sensitive comparison
 static int ucstrcmp(const QChar *a, int alen, const QChar *b, int blen)
 {
     if (a == b && alen == blen)
@@ -202,22 +202,55 @@ static int ucstrnicmp(const ushort *a, const ushort *b, int l)
     return ucstricmp(a, a + l, b, b + l);
 }
 
+// Benchmarking indicates that doing memcmp is much slower than
+// executing the comparison ourselves.
+//
+// The profiling was done on a population of calls to qMemEquals, generated
+// during a run of the demo browser. The profile of the data (32-bit x86
+// Linux) was:
+//
+//  total number of comparisons: 21353
+//  longest string compared: 95
+//  average comparison length: 14.8786
+//  cache-line crosses: 5661 (13.3%)
+//  alignment histogram:
+//   0xXXX0 = 512 (1.2%) strings, 0 (0.0%) of which same-aligned
+//   0xXXX2 = 15087 (35.3%) strings, 5145 (34.1%) of which same-aligned
+//   0xXXX4 = 525 (1.2%) strings, 0 (0.0%) of which same-aligned
+//   0xXXX6 = 557 (1.3%) strings, 6 (1.1%) of which same-aligned
+//   0xXXX8 = 509 (1.2%) strings, 0 (0.0%) of which same-aligned
+//   0xXXXa = 24358 (57.0%) strings, 9901 (40.6%) of which same-aligned
+//   0xXXXc = 557 (1.3%) strings, 0 (0.0%) of which same-aligned
+//   0xXXXe = 601 (1.4%) strings, 15 (2.5%) of which same-aligned
+//   total  = 42706 (100%) strings, 15067 (35.3%) of which same-aligned
+//
+// 92% of the strings have alignment of 2 or 10, which is due to malloc on
+// 32-bit Linux returning values aligned to 8 bytes, and offsetof(array, QString::Data) == 18.
+//
+// The profile on 64-bit will be different since offsetof(array, QString::Data) == 26.
+//
+// The benchmark results were, for a Core-i7 @ 2.67 GHz 32-bit, compiled with -O3 -funroll-loops:
+//   16-bit loads only:           872,301 CPU ticks [Qt 4.5 / memcmp]
+//   32- and 16-bit loads:        773,362 CPU ticks [Qt 4.6]
+//   SSE2 "movdqu" 128-bit loads: 618,736 CPU ticks
+//   SSE3 "lddqu" 128-bit loads:  619,954 CPU ticks
+//   SSSE3 "palignr" corrections: 852,147 CPU ticks
+//   SSE4.2 "pcmpestrm":          738,702 CPU ticks
+//
+// The same benchmark on an Atom N450 @ 1.66 GHz, is:
+//  16-bit loads only:            2,185,882 CPU ticks
+//  32- and 16-bit loads:         1,805,060 CPU ticks
+//  SSE2 "movdqu" 128-bit loads:  2,529,843 CPU ticks
+//  SSE3 "lddqu" 128-bit loads:   2,514,858 CPU ticks
+//  SSSE3 "palignr" corrections:  2,160,325 CPU ticks
+//  SSE4.2 not available
+//
+// The conclusion we reach is that alignment the SSE2 unaligned code can gain
+// 20% improvement in performance in some systems, but suffers a penalty due
+// to the unaligned loads on others.
+
 static bool qMemEquals(const quint16 *a, const quint16 *b, int length)
 {
-    // Benchmarking indicates that doing memcmp is much slower than
-    // executing the comparison ourselves.
-    // To make it even faster, we do a 32-bit comparison, comparing
-    // twice the amount of data as a normal word-by-word comparison.
-    //
-    // Benchmarking results on a 2.33 GHz Core2 Duo, with a 64-QChar
-    // block of data, with 4194304 iterations (per iteration):
-    //    operation             usec            cpu ticks
-    //     memcmp                330               710
-    //     16-bit                 79             167-171
-    //  32-bit aligned            49             105-109
-    //
-    // Testing also indicates that unaligned 32-bit loads are as
-    // performant as 32-bit aligned.
     if (a == b || !length)
         return true;
 
diff --git a/src/corelib/tools/qstring.h b/src/corelib/tools/qstring.h
index e52f59f..06e4d47 100644
--- a/src/corelib/tools/qstring.h
+++ b/src/corelib/tools/qstring.h
@@ -614,6 +614,7 @@ private:
         ushort asciiCache : 1;
         ushort capacity : 1;
         ushort reserved : 11;
+        // ### Qt5: try to ensure that "array" is aligned to 16 bytes on both 32- and 64-bit
         ushort array[1];
     };
     static Data shared_null;
author	Thiago Macieira <thiago.macieira@nokia.com>	2010-08-17 18:34:54 (GMT)
committer	Thiago Macieira <thiago.macieira@nokia.com>	2010-08-24 10:36:36 (GMT)
commit	8063e04f3bf6e46ec972b6cafea39454a47e16de (patch)
tree	113aefcfbd59adc7bbc84efe121a235108b0b675 /src/corelib
parent	c58293407519e7413cb6493f831ef69cf2323030 (diff)
download	Qt-8063e04f3bf6e46ec972b6cafea39454a47e16de.zip Qt-8063e04f3bf6e46ec972b6cafea39454a47e16de.tar.gz Qt-8063e04f3bf6e46ec972b6cafea39454a47e16de.tar.bz2