summaryrefslogtreecommitdiffstats
path: root/tests/benchmarks
diff options
context:
space:
mode:
authorThiago Macieira <thiago.macieira@nokia.com>2010-08-18 09:07:51 (GMT)
committerThiago Macieira <thiago.macieira@nokia.com>2010-08-24 10:36:39 (GMT)
commit6f913f94c6b6aaf8514bc62d4f9939ac44e211fb (patch)
tree2e4fb299f4ac864ad41fac78d9207b850e0b605a /tests/benchmarks
parent2bd9d7fbec0bb61298ba0f48a93a4a186b558a38 (diff)
downloadQt-6f913f94c6b6aaf8514bc62d4f9939ac44e211fb.zip
Qt-6f913f94c6b6aaf8514bc62d4f9939ac44e211fb.tar.gz
Qt-6f913f94c6b6aaf8514bc62d4f9939ac44e211fb.tar.bz2
Add an SSE2 version of ucstrncmp that aligns its loads.
This uses a different alignment technique: instead of reading some bytes before the start of the string, we read some bytes of the string twice. At best, this is only a 2% improvement over the unaligned SSE2 version on a Core i7.
Diffstat (limited to 'tests/benchmarks')
-rw-r--r--tests/benchmarks/corelib/tools/qstring/main.cpp44
1 files changed, 43 insertions, 1 deletions
diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp
index f338e49..608e2bc 100644
--- a/tests/benchmarks/corelib/tools/qstring/main.cpp
+++ b/tests/benchmarks/corelib/tools/qstring/main.cpp
@@ -913,6 +913,46 @@ static __attribute__((optimize("no-unroll-loops"))) int ucstrncmp_sse2(const ush
}
return ucstrncmp_shortwise(a + counter, b + counter, len);
}
+// Benchmark variant of ucstrncmp: compares a and b eight ushorts per SSE2 step, using 16-byte-aligned loads from a; a tail of fewer than 8 ushorts is handed to ucstrncmp_shortwise.
+static __attribute__((optimize("no-unroll-loops"))) int ucstrncmp_sse2_aligning(const ushort *a, const ushort *b, int len)
+{
+ if (len >= 8) { // enough data for one full 16-byte vector: compare the first 8 ushorts with unaligned loads
+ __m128i m1 = _mm_loadu_si128((__m128i *)a);
+ __m128i m2 = _mm_loadu_si128((__m128i *)b);
+ __m128i cmp = _mm_cmpeq_epi16(m1, m2); // each 16-bit lane becomes all-ones where a and b are equal
+ ushort mask = ~uint(_mm_movemask_epi8(cmp)); // one bit per byte; after the inversion a set bit marks a differing byte
+ if (mask) {
+ // At least one of the 8 ushorts differs; locate the first one.
+ int counter = bsf_nonzero(mask)/2; // presumably the index of the lowest set bit (helper not shown here); /2 turns a byte index into a ushort index
+ return a[counter] - b[counter];
+ }


+ // The first 8 ushorts are equal. Advance a to the next 16-byte boundary so the loop below can use aligned loads.
+ int diff = 8 - (quintptr(a) & 0xf)/2; // 1..8 ushorts; NOTE(review): assumes a is 2-byte aligned, otherwise a + diff is not on a 16-byte boundary -- confirm against callers
+ len -= diff; // when diff < 8, ushorts [diff, 8) are compared a second time by the loop
+ a += diff;
+ b += diff; // b moves by the same amount as a, so its own alignment is not guaranteed
+ }
+
+ qptrdiff counter = 0;
+ while (len >= 8) { // len counts the ushorts still to compare, starting at a + counter
+ __m128i m1 = _mm_load_si128((__m128i *)(a + counter)); // aligned load; relies on a having been advanced to a 16-byte boundary above
+ __m128i m2 = _mm_loadu_si128((__m128i *)(b + counter)); // unaligned load
+ __m128i cmp = _mm_cmpeq_epi16(m1, m2);
+ ushort mask = ~uint(_mm_movemask_epi8(cmp));
+ if (mask) {
+ // Same mismatch search as in the first block, offset by the ushorts already consumed.
+ counter += bsf_nonzero(mask)/2;
+ return a[counter] - b[counter];
+ }
+
+ counter += 8;
+ len -= 8;
+ }
+ return ucstrncmp_shortwise(a + counter, b + counter, len); // fewer than 8 ushorts remain at this point
+}
+
#endif
typedef int (* UcstrncmpFunction)(const ushort *, const ushort *, int);
@@ -925,6 +965,7 @@ void tst_QString::ucstrncmp_data() const
QTest::newRow("shortwise") << &ucstrncmp_shortwise;
QTest::newRow("intwise") << &ucstrncmp_intwise;
QTest::newRow("sse2") << &ucstrncmp_sse2;
+ QTest::newRow("sse2_aligning") << &ucstrncmp_sse2_aligning;
}
void tst_QString::ucstrncmp() const
@@ -934,7 +975,8 @@ void tst_QString::ucstrncmp() const
static const UcstrncmpFunction func[] = {
&ucstrncmp_shortwise,
&ucstrncmp_intwise,
- &ucstrncmp_sse2
+ &ucstrncmp_sse2,
+ &ucstrncmp_sse2_aligning
};
static const int functionCount = sizeof func / sizeof func[0];