summaryrefslogtreecommitdiffstats
path: root/tests/benchmarks/corelib
diff options
context:
space:
mode:
authorThiago Macieira <thiago.macieira@nokia.com>2011-03-19 10:03:50 (GMT)
committerThiago Macieira <thiago.macieira@nokia.com>2011-03-22 14:51:55 (GMT)
commit86ee899d3d01463c55ee9ba753ee3d47f87ad07d (patch)
tree84ca9b450f0dd7346a64ac4576de298986e72334 /tests/benchmarks/corelib
parentcd0518deb6cf07571f1331ca83d1b5a97b3ca47e (diff)
downloadQt-86ee899d3d01463c55ee9ba753ee3d47f87ad07d.zip
Qt-86ee899d3d01463c55ee9ba753ee3d47f87ad07d.tar.gz
Qt-86ee899d3d01463c55ee9ba753ee3d47f87ad07d.tar.bz2
Improve the core loop a little more and propagate the change to the other code
Currently, the "improved SSE2" version and the SSE4.1 version are yielding the best results, within 1% of each other. These results are around 20% better than the Qt 4.7 code.
Diffstat (limited to 'tests/benchmarks/corelib')
-rw-r--r--tests/benchmarks/corelib/tools/qstring/main.cpp40
1 files changed, 22 insertions, 18 deletions
diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp
index 4a03e5a..f2d6de7 100644
--- a/tests/benchmarks/corelib/tools/qstring/main.cpp
+++ b/tests/benchmarks/corelib/tools/qstring/main.cpp
@@ -1517,7 +1517,8 @@ void fromLatin1_sse2_improved(ushort *dst, const char *str, int size)
{
const __m128i nullMask = _mm_set1_epi32(0);
qptrdiff counter = 0;
- while (size - counter >= 16) {
+ size -= 16;
+ while (size >= counter) {
const __m128i chunk = _mm_loadu_si128((__m128i*)(str + counter)); // load
// unpack the first 8 bytes, padding with zeros
@@ -1530,6 +1531,7 @@ void fromLatin1_sse2_improved(ushort *dst, const char *str, int size)
counter += 16;
}
+ size += 16;
fromLatin1_epilog(dst + counter, str + counter, size - counter);
}
@@ -1584,7 +1586,7 @@ void fromLatin1_prolog_sse2_overcommit(ushort *dst, const char *str, int)
template<FromLatin1Function prologFunction>
void fromLatin1_sse2_withprolog(ushort *dst, const char *str, int size)
{
- // same as the Qt 4.7 code, but we attempt to align at the prolog
+ // same as the improved code, but we attempt to align at the prolog
// therefore, we issue aligned stores
if (size >= 16) {
@@ -1599,43 +1601,45 @@ void fromLatin1_sse2_withprolog(ushort *dst, const char *str, int size)
}
const __m128i nullMask = _mm_set1_epi32(0);
- while (size >= 16) {
- const __m128i chunk = _mm_loadu_si128((__m128i*)str); // load
+ qptrdiff counter = 0;
+ size -= 16;
+ while (size >= counter) {
+ const __m128i chunk = _mm_loadu_si128((__m128i*)(str + counter)); // load
// unpack the first 8 bytes, padding with zeros
const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
- _mm_store_si128((__m128i*)dst, firstHalf); // store
+ _mm_store_si128((__m128i*)(dst + counter), firstHalf); // store
// unpack the last 8 bytes, padding with zeros
const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
- _mm_store_si128((__m128i*)(dst + 8), secondHalf); // store
+ _mm_store_si128((__m128i*)(dst + counter + 8), secondHalf); // store
- str += 16;
- dst += 16;
- size -= 16;
+ counter += 16;
}
- fromLatin1_epilog(dst, str, size);
+ size += 16;
+ fromLatin1_epilog(dst + counter, str + counter, size - counter);
}
void fromLatin1_sse4_pmovzxbw(ushort *dst, const char *str, int size)
{
- while (size >= 16) {
- __m128i chunk = _mm_loadu_si128((__m128i*)str); // load
+ qptrdiff counter = 0;
+ size -= 16;
+ while (size >= counter) {
+ __m128i chunk = _mm_loadu_si128((__m128i*)(str + counter)); // load
// unpack the first 8 bytes, padding with zeros
const __m128i firstHalf = _mm_cvtepu8_epi16(chunk);
- _mm_storeu_si128((__m128i*)dst, firstHalf); // store
+ _mm_storeu_si128((__m128i*)(dst + counter), firstHalf); // store
// unpack the last 8 bytes, padding with zeros
chunk = _mm_srli_si128(chunk, 8);
const __m128i secondHalf = _mm_cvtepu8_epi16(chunk);
- _mm_storeu_si128((__m128i*)(dst + 8), secondHalf); // store
+ _mm_storeu_si128((__m128i*)(dst + counter + 8), secondHalf); // store
- str += 16;
- dst += 16;
- size -= 16;
+ counter += 16;
}
- fromLatin1_epilog(dst, str, size);
+ size += 16;
+ fromLatin1_epilog(dst + counter, str + counter, size - counter);
}
void fromLatin1_prolog_sse4_overcommit(ushort *dst, const char *str, int)