From 531a8f198c152e1135db103f22bab648a314926e Mon Sep 17 00:00:00 2001
From: Thiago Macieira
Date: Thu, 12 Aug 2010 21:18:49 +0200
Subject: Add an SSE2 comparison with prolog

The prolog tries to align p1 to a multiple of 16, so as to run aligned
loads, which are faster. Unfortunately, my tests so far indicate that
the prolog ends up taking longer than the benefit of having aligned
loads.
---
 tests/benchmarks/corelib/tools/qstring/main.cpp | 38 +++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp
index 5210034..4e5d1c0 100644
--- a/tests/benchmarks/corelib/tools/qstring/main.cpp
+++ b/tests/benchmarks/corelib/tools/qstring/main.cpp
@@ -238,6 +238,42 @@ static bool equals2_sse2(ushort *p1, ushort *p2, int len)
 
     return equals2_shortwise(p1, p2, len);
 }
+
+static inline
+#ifdef Q_CC_GNU
+__attribute__((always_inline))
+#endif
+bool prolog_align(ushort *&p1, ushort *&p2, int &len)
+{
+    const ushort *end = (ushort*) ((quintptr(p1) + 15) & ~15);
+    if (end > p1 + len)
+        end = p1 + len;
+    for ( ; p1 != end; ++p1, ++p2, --len)
+        if (*p1 != *p2)
+            return false;
+    return true;
+}
+
+static bool equals2_sse2_aligning(ushort *p1, ushort *p2, int len)
+{
+    if (len > 8) {
+        if (!prolog_align(p1, p2, len))
+            return false;
+        while (len > 8) {
+            __m128i q1 = _mm_load_si128((__m128i *)p1);
+            __m128i q2 = _mm_loadu_si128((__m128i *)p2);
+            __m128i cmp = _mm_cmpeq_epi16(q1, q2);
+            if (ushort(_mm_movemask_epi8(cmp)) != 0xffff)
+                return false;
+
+            len -= 8;
+            p1 += 8;
+            p2 += 8;
+        }
+    }
+
+    return equals2_shortwise(p1, p2, len);
+}
 #endif
 
 void tst_QString::equals2_data() const
@@ -250,6 +286,7 @@ void tst_QString::equals2_data() const
     QTest::newRow("intwise") << 3;
 #ifdef __SSE2__
     QTest::newRow("sse2") << 4;
+    QTest::newRow("sse2_aligning") << 5;
 #endif
 }
 
@@ -300,6 +337,7 @@ void tst_QString::equals2() const
         equals2_intwise, // 3
 #ifdef __SSE2__
         equals2_sse2, // 4
+        equals2_sse2_aligning, // 5
 #endif
         0
     };
-- 
cgit v0.12