summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorThiago Macieira <thiago.macieira@nokia.com>2010-08-18 10:36:07 (GMT)
committerThiago Macieira <thiago.macieira@nokia.com>2010-08-24 10:36:41 (GMT)
commit4f891889118d4bcc417382a0a646f3683c621b10 (patch)
treed557e21d6e54b1168571f29c39e56c46db5543da
parent08fa99c43897b16f8be924090316f5a4db548c10 (diff)
downloadQt-4f891889118d4bcc417382a0a646f3683c621b10.zip
Qt-4f891889118d4bcc417382a0a646f3683c621b10.tar.gz
Qt-4f891889118d4bcc417382a0a646f3683c621b10.tar.bz2
Add a ucstrncmp variant that uses SSSE3 with aligned loads.
On i7, the results are a 32% improvement over the current code, a 13% improvement over 4-byte loads, and a 6% improvement over the unaligned SSSE3 loads. However, it is about 2.5% slower than the pure SSE2 code due to its complexity. On Atom, the results are a 30% improvement over the current code, 7% over 4-byte loads, 15% over pure unaligned SSE2, and 9% over unaligned SSSE3.
-rw-r--r--tests/benchmarks/corelib/tools/qstring/main.cpp53
1 files changed, 52 insertions, 1 deletions
diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp
index 3c3d1ad..d40b9bc 100644
--- a/tests/benchmarks/corelib/tools/qstring/main.cpp
+++ b/tests/benchmarks/corelib/tools/qstring/main.cpp
@@ -1063,6 +1063,55 @@ static int ucstrncmp_ssse3(const ushort *a, const ushort *b, int len)
return ucstrncmp_short_tail(a, b, len);
}
+static int ucstrncmp_ssse3_aligning(const ushort *a, const ushort *b, int len) // compare 'len' ushorts of 'a' and 'b' after aligning 'b' to 16 bytes
+{ // On a mismatch found in the first vector, returns a[i] - b[i]; all other results come from helpers defined elsewhere in the file.
+ if (len >= 8) { // at least 8 ushorts (16 bytes): one full 128-bit vector can be loaded
+ __m128i m1 = _mm_loadu_si128((__m128i *)a); // unaligned load, neither pointer is known to be aligned yet
+ __m128i m2 = _mm_loadu_si128((__m128i *)b);
+ __m128i cmp = _mm_cmpeq_epi16(m1, m2); // equal 16-bit lanes become all-ones
+ ushort mask = ~uint(_mm_movemask_epi8(cmp)); // one bit per byte; after inverting and truncating to 16 bits, set bits mark differing bytes
+ if (mask) {
+ // which ushort isn't equal? (mask holds per-byte bits, so the bit index is halved)
+ int counter = bsf_nonzero(mask)/2; // bsf_nonzero is defined elsewhere; presumably the index of the lowest set bit - TODO confirm
+ return a[counter] - b[counter];
+ }


+ // now advance so that 'b' is 16-byte aligned and can be read with aligned 16-byte loads
+ int diff = 8 - (quintptr(b) & 0xf)/2; // ushorts up to the next 16-byte boundary of 'b' (1..8; 8 if already aligned); assumes 'b' is 2-byte aligned - TODO confirm
+ len -= diff; // every skipped element was verified equal by the vector compare above
+ a += diff;
+ b += diff;
+ }
+
+ if (len < 8) // also taken when the caller passed fewer than 8 ushorts
+ return ucstrncmp_short_tail(a, b, len);
+
+ // 'b' is aligned: len >= 8 here implies the block above ran, since len only decreases
+ int val = quintptr(a) & 0xf; // byte offset of 'a' within its 16-byte block
+ a -= val/2; // step 'a' back val/2 ushorts to the start of that block (for even val); NOTE(review): the alignr helper presumably compensates for this offset - confirm
+
+ if (val == 8) // the offset is a template argument, so each value needs its own call site
+ return ucstrncmp_ssse3_alignr<8>(a, b, len);
+ else if (val == 0) // both pointers are 16-byte aligned
+ return ucstrncmp_sse2_aligned(a, b, len);
+ if (val < 8) {
+ if (val < 4) // val is 2 for a 2-byte-aligned 'a' (0 was handled above)
+ return ucstrncmp_ssse3_alignr<2>(a, b, len);
+ else if (val == 4)
+ return ucstrncmp_ssse3_alignr<4>(a, b, len);
+ else // val is 6 for a 2-byte-aligned 'a'
+ return ucstrncmp_ssse3_alignr<6>(a, b, len);
+ } else {
+ if (val < 12) // val is 10 for a 2-byte-aligned 'a' (8 was handled above)
+ return ucstrncmp_ssse3_alignr<10>(a, b, len);
+ else if (val == 12)
+ return ucstrncmp_ssse3_alignr<12>(a, b, len);
+ else // val is 14 for a 2-byte-aligned 'a'
+ return ucstrncmp_ssse3_alignr<14>(a, b, len);
+ }
+}
+
#endif
typedef int (* UcstrncmpFunction)(const ushort *, const ushort *, int);
@@ -1077,6 +1126,7 @@ void tst_QString::ucstrncmp_data() const
QTest::newRow("sse2") << &ucstrncmp_sse2;
QTest::newRow("sse2_aligning") << &ucstrncmp_sse2_aligning;
QTest::newRow("ssse3") << &ucstrncmp_ssse3;
+ QTest::newRow("ssse3_aligning") << &ucstrncmp_ssse3_aligning;
}
void tst_QString::ucstrncmp() const
@@ -1088,7 +1138,8 @@ void tst_QString::ucstrncmp() const
&ucstrncmp_intwise,
&ucstrncmp_sse2,
&ucstrncmp_sse2_aligning,
- &ucstrncmp_ssse3
+ &ucstrncmp_ssse3,
+ &ucstrncmp_ssse3_aligning
};
static const int functionCount = sizeof func / sizeof func[0];