From 4f891889118d4bcc417382a0a646f3683c621b10 Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Wed, 18 Aug 2010 12:36:07 +0200 Subject: Add an ucstrncmp that uses SSSE3 with aligning. The results on i7 are 32% improvement over current code, 13% improvement over 4-byte loads, 6% over the unaligned SSSE3 loads. However, it's about 2.5% slower than pure SSE2 code due to complexity. The results on Atom are 30% improvement over current code, 7% over 4-byte loads, 15% over pure unaligned SSE2 and 9% over unaligned SSSE3. --- tests/benchmarks/corelib/tools/qstring/main.cpp | 53 ++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp index 3c3d1ad..d40b9bc 100644 --- a/tests/benchmarks/corelib/tools/qstring/main.cpp +++ b/tests/benchmarks/corelib/tools/qstring/main.cpp @@ -1063,6 +1063,55 @@ static int ucstrncmp_ssse3(const ushort *a, const ushort *b, int len) return ucstrncmp_short_tail(a, b, len); } +static int ucstrncmp_ssse3_aligning(const ushort *a, const ushort *b, int len) +{ + if (len >= 8) { + __m128i m1 = _mm_loadu_si128((__m128i *)a); + __m128i m2 = _mm_loadu_si128((__m128i *)b); + __m128i cmp = _mm_cmpeq_epi16(m1, m2); + ushort mask = ~uint(_mm_movemask_epi8(cmp)); + if (mask) { + // which ushort isn't equal? + int counter = bsf_nonzero(mask)/2; + return a[counter] - b[counter]; + } + + + // now 'b' align to do 16-byte loads + int diff = 8 - (quintptr(b) & 0xf)/2; + len -= diff; + a += diff; + b += diff; + } + + if (len < 8) + return ucstrncmp_short_tail(a, b, len); + + // 'b' is aligned + int val = quintptr(a) & 0xf; + a -= val/2; + + if (val == 8) + return ucstrncmp_ssse3_alignr<8>(a, b, len); + else if (val == 0) + return ucstrncmp_sse2_aligned(a, b, len); + if (val < 8) { + if (val < 4) + return ucstrncmp_ssse3_alignr<2>(a, b, len); + else if (val == 4) + return ucstrncmp_ssse3_alignr<4>(a, b, len); + else + return ucstrncmp_ssse3_alignr<6>(a, b, len); + } else { + if (val < 12) + return ucstrncmp_ssse3_alignr<10>(a, b, len); + else if (val == 12) + return ucstrncmp_ssse3_alignr<12>(a, b, len); + else + return ucstrncmp_ssse3_alignr<14>(a, b, len); + } +} + #endif typedef int (* UcstrncmpFunction)(const ushort *, const ushort *, int); @@ -1077,6 +1126,7 @@ void tst_QString::ucstrncmp_data() const QTest::newRow("sse2") << &ucstrncmp_sse2; QTest::newRow("sse2_aligning") << &ucstrncmp_sse2_aligning; QTest::newRow("ssse3") << &ucstrncmp_ssse3; + QTest::newRow("ssse3_aligning") << &ucstrncmp_ssse3_aligning; } void tst_QString::ucstrncmp() const @@ -1088,7 +1138,8 @@ void tst_QString::ucstrncmp() const &ucstrncmp_intwise, &ucstrncmp_sse2, &ucstrncmp_sse2_aligning, - &ucstrncmp_ssse3 + &ucstrncmp_ssse3, + &ucstrncmp_ssse3_aligning }; static const int functionCount = sizeof func / sizeof func[0]; -- cgit v0.12