diff options
author | Thiago Macieira <thiago.macieira@nokia.com> | 2011-03-24 09:27:43 (GMT) |
---|---|---|
committer | Thiago Macieira <thiago.macieira@nokia.com> | 2011-03-27 20:15:57 (GMT) |
commit | 6567e793d0f1388ef6389ef91ace1ed8dc726019 (patch) | |
tree | fd5a8e4644f7c1eafd3dfd7c33423960e6e504ab | |
parent | 6c81dd5c34edb66f7947751bf3ed7b134e1ffca4 (diff) | |
download | Qt-6567e793d0f1388ef6389ef91ace1ed8dc726019.zip Qt-6567e793d0f1388ef6389ef91ace1ed8dc726019.tar.gz Qt-6567e793d0f1388ef6389ef91ace1ed8dc726019.tar.bz2 |
Add a once-over unrolled fromLatin1 conversion (32 characters)
This was suggested by Matthias Kretz on my blog. It improves on GCC
(one case up to 5%), but worsens on ICC (up to 5%).
-rw-r--r-- | tests/benchmarks/corelib/tools/qstring/main.cpp | 46 |
1 files changed, 46 insertions, 0 deletions
diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp index 1a38354..96f2c30 100644 --- a/tests/benchmarks/corelib/tools/qstring/main.cpp +++ b/tests/benchmarks/corelib/tools/qstring/main.cpp @@ -1538,6 +1538,51 @@ void fromLatin1_sse2_improved(ushort *dst, const char *str, int size) fromLatin1_epilog(dst + counter, str + counter, size - counter); } +void fromLatin1_sse2_improved2(ushort *dst, const char *str, int size) +{ + const __m128i nullMask = _mm_set1_epi32(0); + qptrdiff counter = 0; + size -= 32; + while (size >= counter) { + const __m128i chunk1 = _mm_loadu_si128((__m128i*)(str + counter)); // load + const __m128i chunk2 = _mm_loadu_si128((__m128i*)(str + counter + 16)); // load + + // unpack the first 8 bytes, padding with zeros + const __m128i firstHalf1 = _mm_unpacklo_epi8(chunk1, nullMask); + _mm_storeu_si128((__m128i*)(dst + counter), firstHalf1); // store + + // unpack the last 8 bytes, padding with zeros + const __m128i secondHalf1 = _mm_unpackhi_epi8(chunk1, nullMask); + _mm_storeu_si128((__m128i*)(dst + counter + 8), secondHalf1); // store + + // unpack the first 8 bytes, padding with zeros + const __m128i firstHalf2 = _mm_unpacklo_epi8(chunk2, nullMask); + _mm_storeu_si128((__m128i*)(dst + counter + 16), firstHalf2); // store + + // unpack the last 8 bytes, padding with zeros + const __m128i secondHalf2 = _mm_unpackhi_epi8(chunk2, nullMask); + _mm_storeu_si128((__m128i*)(dst + counter + 24), secondHalf2); // store + + counter += 32; + } + size += 16; + if (size >= counter) { + const __m128i chunk = _mm_loadu_si128((__m128i*)(str + counter)); // load + + // unpack the first 8 bytes, padding with zeros + const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask); + _mm_storeu_si128((__m128i*)(dst + counter), firstHalf); // store + + // unpack the last 8 bytes, padding with zeros + const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask); + _mm_storeu_si128((__m128i*)(dst + counter + 8), secondHalf); // store + + counter += 16; + } + size += 16; + fromLatin1_epilog(dst + counter, str + counter, size - counter); +} + void fromLatin1_prolog_unrolled(ushort *dst, const char *str, int size) { // QString's data pointer is most often ending in 0x2 or 0xa @@ -1787,6 +1832,7 @@ void tst_QString::fromLatin1Alternatives_data() const #ifdef __SSE2__ QTest::newRow("sse2-qt4.7") << &fromLatin1_sse2_qt47; QTest::newRow("sse2-improved") << &fromLatin1_sse2_improved; + QTest::newRow("sse2-improved2") << &fromLatin1_sse2_improved2; QTest::newRow("sse2-with-prolog-regular") << &fromLatin1_sse2_withprolog<&fromLatin1_regular>; QTest::newRow("sse2-with-prolog-unrolled") << &fromLatin1_sse2_withprolog<&fromLatin1_prolog_unrolled>; QTest::newRow("sse2-with-prolog-sse2-overcommit") << &fromLatin1_sse2_withprolog<&fromLatin1_prolog_sse2_overcommit>; |