diff options
author | Thiago Macieira <thiago.macieira@nokia.com> | 2010-08-12 21:12:21 (GMT) |
---|---|---|
committer | Thiago Macieira <thiago.macieira@nokia.com> | 2010-08-24 10:36:33 (GMT) |
commit | 7790cf5b2922a7adf684dc0b7cd0fc1583c0684a (patch) | |
tree | 87d64032de208cb9b74bdb05b163c74617ca5b48 /tests | |
parent | 531a8f198c152e1135db103f22bab648a314926e (diff) | |
download | Qt-7790cf5b2922a7adf684dc0b7cd0fc1583c0684a.zip Qt-7790cf5b2922a7adf684dc0b7cd0fc1583c0684a.tar.gz Qt-7790cf5b2922a7adf684dc0b7cd0fc1583c0684a.tar.bz2 |
Add an SSSE3 version that uses palignr to align.
Instead of using a non-SIMD method for aligning, we load more bytes
from p1 and use the PALIGNR instruction to realign to what we want.
The result is that it's a bit slower than the non-SIMD comparison, due to the
complexity. For strings over 8 QChars wide, it's only slightly worse than
the non-SIMD comparison.
Diffstat (limited to 'tests')
-rw-r--r-- | tests/benchmarks/corelib/tools/qstring/main.cpp | 110 | ||||
-rw-r--r-- | tests/benchmarks/corelib/tools/qstring/qstring.pro | 3 |
2 files changed, 112 insertions, 1 deletions
diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp index 4e5d1c0..e1800c0 100644 --- a/tests/benchmarks/corelib/tools/qstring/main.cpp +++ b/tests/benchmarks/corelib/tools/qstring/main.cpp @@ -274,6 +274,110 @@ static bool equals2_sse2_aligning(ushort *p1, ushort *p2, int len) return equals2_shortwise(p1, p2, len); } + +template<int N> static inline bool equals2_ssse3_alignr(__m128i *m1, __m128i *m2, int len) +{ + __m128i lower = _mm_load_si128(m1); + while (len > 8) { + __m128i upper = _mm_load_si128(m1 + 1); + __m128i correct; + correct = _mm_alignr_epi8(upper, lower, N); + + __m128i q2 = _mm_loadu_si128(m2); + __m128i cmp = _mm_cmpeq_epi16(correct, q2); + if (ushort(_mm_movemask_epi8(cmp)) != 0xffff) + return false; + + len -= 8; + ++m2; + ++m1; + lower = upper; + } + + // tail + return len == 0 || equals2_shortwise((ushort *)m1 + N / 2, (ushort*)m2, len); +} + +static inline bool equals2_ssse3_aligned(__m128i *m1, __m128i *m2, int len) +{ + while (len > 8) { + __m128i q2 = _mm_loadu_si128(m2); + __m128i cmp = _mm_cmpeq_epi16(*m1, q2); + if (ushort(_mm_movemask_epi8(cmp)) != 0xffff) + return false; + + len -= 8; + ++m1; + ++m2; + } + return len == 0 || equals2_shortwise((ushort *)m1, (ushort *)m2, len); +} + +//#ifdef __SSSE3__ +static bool equals2_ssse3(ushort *p1, ushort *p2, int len) +{ + // p1 & 0xf can be: + // 0, 2, 4, 6, 8, 10, 12, 14 + // If it's 0, we're aligned + // If it's not, then we're interested in the 16 - (p1 & 0xf) bytes only + + if (len > 8) { + // find the last aligned position below the p1 memory + __m128i *m1 = (__m128i *)(quintptr(p1) & ~0xf); + __m128i *m2 = (__m128i *)p2; + uchar diff = quintptr(p1) - quintptr(m1); + + // diff contains the number of extra bytes + if (diff < 8) { + if (diff < 4) { + if (diff == 0) + return equals2_ssse3_aligned(m1, m2, len); + else // diff == 2 + return equals2_ssse3_alignr<2>(m1, m2, len); + } else { + if (diff == 4) + return 
equals2_ssse3_alignr<4>(m1, m2, len); + else // diff == 6 + return equals2_ssse3_alignr<6>(m1, m2, len); + } + } else { + if (diff < 12) { + if (diff == 8) + return equals2_ssse3_alignr<8>(m1, m2, len); + else // diff == 10 + return equals2_ssse3_alignr<10>(m1, m2, len); + } else { + if (diff == 12) + return equals2_ssse3_alignr<12>(m1, m2, len); + else // diff == 14 + return equals2_ssse3_alignr<14>(m1, m2, len); + } + } + +// switch (diff) { +// case 0: +// return equals2_ssse3_aligned(m1, m2, len); +// case 2: +// return equals2_ssse3_alignr<2>(m1, m2, len); +// case 4: +// return equals2_ssse3_alignr<4>(m1, m2, len); +// case 6: +// return equals2_ssse3_alignr<6>(m1, m2, len); +// case 8: +// return equals2_ssse3_alignr<8>(m1, m2, len); +// case 10: +// return equals2_ssse3_alignr<10>(m1, m2, len); +// case 12: +// return equals2_ssse3_alignr<12>(m1, m2, len); +// case 14: +// return equals2_ssse3_alignr<14>(m1, m2, len); +// } + } + + // tail + return equals2_shortwise(p1, p2, len); +} +//#endif #endif void tst_QString::equals2_data() const @@ -287,6 +391,9 @@ void tst_QString::equals2_data() const #ifdef __SSE2__ QTest::newRow("sse2") << 4; QTest::newRow("sse2_aligning") << 5; +#ifdef __SSSE3__ + QTest::newRow("ssse3") << 6; +#endif #endif } @@ -338,6 +445,9 @@ void tst_QString::equals2() const #ifdef __SSE2__ equals2_sse2, // 4 equals2_sse2_aligning, // 5 +#ifdef __SSSE3__ + equals2_ssse3, // 6 +#endif #endif 0 }; diff --git a/tests/benchmarks/corelib/tools/qstring/qstring.pro b/tests/benchmarks/corelib/tools/qstring/qstring.pro index 388e3c2..bc6254c 100644 --- a/tests/benchmarks/corelib/tools/qstring/qstring.pro +++ b/tests/benchmarks/corelib/tools/qstring/qstring.pro @@ -14,4 +14,5 @@ wince*:{ DEFINES += SRCDIR=\\\"$$PWD/\\\" } -sse2:QMAKE_CXXFLAGS += -msse2 +ssse3:QMAKE_FLAGS += -mssse3 +else:sse2:QMAKE_CXXFLAGS += -msse2 |