diff options
author | Thiago Macieira <thiago.macieira@nokia.com> | 2011-03-19 10:03:50 (GMT) |
---|---|---|
committer | Thiago Macieira <thiago.macieira@nokia.com> | 2011-03-22 14:51:55 (GMT) |
commit | 86ee899d3d01463c55ee9ba753ee3d47f87ad07d (patch) | |
tree | 84ca9b450f0dd7346a64ac4576de298986e72334 /tests/benchmarks/corelib | |
parent | cd0518deb6cf07571f1331ca83d1b5a97b3ca47e (diff) | |
download | Qt-86ee899d3d01463c55ee9ba753ee3d47f87ad07d.zip Qt-86ee899d3d01463c55ee9ba753ee3d47f87ad07d.tar.gz Qt-86ee899d3d01463c55ee9ba753ee3d47f87ad07d.tar.bz2 |
Improve the core loop a little more and propagate the change to the other code
Currently, the "improved SSE2" version and the SSE4.1 version
are yielding the best results, within 1% of each other. These results
are around 20% better than the Qt 4.7 code.
Diffstat (limited to 'tests/benchmarks/corelib')
-rw-r--r-- | tests/benchmarks/corelib/tools/qstring/main.cpp | 40 |
1 files changed, 22 insertions, 18 deletions
diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp index 4a03e5a..f2d6de7 100644 --- a/tests/benchmarks/corelib/tools/qstring/main.cpp +++ b/tests/benchmarks/corelib/tools/qstring/main.cpp @@ -1517,7 +1517,8 @@ void fromLatin1_sse2_improved(ushort *dst, const char *str, int size) { const __m128i nullMask = _mm_set1_epi32(0); qptrdiff counter = 0; - while (size - counter >= 16) { + size -= 16; + while (size >= counter) { const __m128i chunk = _mm_loadu_si128((__m128i*)(str + counter)); // load // unpack the first 8 bytes, padding with zeros @@ -1530,6 +1531,7 @@ void fromLatin1_sse2_improved(ushort *dst, const char *str, int size) counter += 16; } + size += 16; fromLatin1_epilog(dst + counter, str + counter, size - counter); } @@ -1584,7 +1586,7 @@ void fromLatin1_prolog_sse2_overcommit(ushort *dst, const char *str, int) template<FromLatin1Function prologFunction> void fromLatin1_sse2_withprolog(ushort *dst, const char *str, int size) { - // same as the Qt 4.7 code, but we attempt to align at the prolog + // same as the improved code, but we attempt to align at the prolog // therefore, we issue aligned stores if (size >= 16) { @@ -1599,43 +1601,45 @@ void fromLatin1_sse2_withprolog(ushort *dst, const char *str, int size) } const __m128i nullMask = _mm_set1_epi32(0); - while (size >= 16) { - const __m128i chunk = _mm_loadu_si128((__m128i*)str); // load + qptrdiff counter = 0; + size -= 16; + while (size >= counter) { + const __m128i chunk = _mm_loadu_si128((__m128i*)(str + counter)); // load // unpack the first 8 bytes, padding with zeros const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask); - _mm_store_si128((__m128i*)dst, firstHalf); // store + _mm_store_si128((__m128i*)(dst + counter), firstHalf); // store // unpack the last 8 bytes, padding with zeros const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask); - _mm_store_si128((__m128i*)(dst + 8), secondHalf); // store + 
_mm_store_si128((__m128i*)(dst + counter + 8), secondHalf); // store - str += 16; - dst += 16; - size -= 16; + counter += 16; } - fromLatin1_epilog(dst, str, size); + size += 16; + fromLatin1_epilog(dst + counter, str + counter, size - counter); } void fromLatin1_sse4_pmovzxbw(ushort *dst, const char *str, int size) { - while (size >= 16) { - __m128i chunk = _mm_loadu_si128((__m128i*)str); // load + qptrdiff counter = 0; + size -= 16; + while (size >= counter) { + __m128i chunk = _mm_loadu_si128((__m128i*)(str + counter)); // load // unpack the first 8 bytes, padding with zeros const __m128i firstHalf = _mm_cvtepu8_epi16(chunk); - _mm_storeu_si128((__m128i*)dst, firstHalf); // store + _mm_storeu_si128((__m128i*)(dst + counter), firstHalf); // store // unpack the last 8 bytes, padding with zeros chunk = _mm_srli_si128(chunk, 8); const __m128i secondHalf = _mm_cvtepu8_epi16(chunk); - _mm_storeu_si128((__m128i*)(dst + 8), secondHalf); // store + _mm_storeu_si128((__m128i*)(dst + counter + 8), secondHalf); // store - str += 16; - dst += 16; - size -= 16; + counter += 16; } - fromLatin1_epilog(dst, str, size); + size += 16; + fromLatin1_epilog(dst + counter, str + counter, size - counter); } void fromLatin1_prolog_sse4_overcommit(ushort *dst, const char *str, int) |