From 86ee899d3d01463c55ee9ba753ee3d47f87ad07d Mon Sep 17 00:00:00 2001
From: Thiago Macieira <thiago.macieira@nokia.com>
Date: Sat, 19 Mar 2011 11:03:50 +0100
Subject: Improve the core loop a little more and propagate to the other code

Currently, the "improved SSE2" version and the SSE4.1 version are
yielding the best results, within 1% of each other. These results
are around 20% better than the Qt 4.7 code.
---
 tests/benchmarks/corelib/tools/qstring/main.cpp | 40 ++++++++++++++-----------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp
index 4a03e5a..f2d6de7 100644
--- a/tests/benchmarks/corelib/tools/qstring/main.cpp
+++ b/tests/benchmarks/corelib/tools/qstring/main.cpp
@@ -1517,7 +1517,8 @@ void fromLatin1_sse2_improved(ushort *dst, const char *str, int size)
 {
     const __m128i nullMask = _mm_set1_epi32(0);
     qptrdiff counter = 0;
-    while (size - counter >= 16) {
+    size -= 16;
+    while (size >= counter) {
         const __m128i chunk = _mm_loadu_si128((__m128i*)(str + counter)); // load
 
         // unpack the first 8 bytes, padding with zeros
@@ -1530,6 +1531,7 @@ void fromLatin1_sse2_improved(ushort *dst, const char *str, int size)
 
         counter += 16;
     }
+    size += 16;
     fromLatin1_epilog(dst + counter, str + counter, size - counter);
 }
 
@@ -1584,7 +1586,7 @@ void fromLatin1_prolog_sse2_overcommit(ushort *dst, const char *str, int)
 template<FromLatin1Function prologFunction>
 void fromLatin1_sse2_withprolog(ushort *dst, const char *str, int size)
 {
-    // same as the Qt 4.7 code, but we attempt to align at the prolog
+    // same as the improved code, but we attempt to align at the prolog
     // therefore, we issue aligned stores
 
     if (size >= 16) {
@@ -1599,43 +1601,45 @@ void fromLatin1_sse2_withprolog(ushort *dst, const char *str, int size)
     }
 
     const __m128i nullMask = _mm_set1_epi32(0);
-    while (size >= 16) {
-        const __m128i chunk = _mm_loadu_si128((__m128i*)str); // load
+    qptrdiff counter = 0;
+    size -= 16;
+    while (size >= counter) {
+        const __m128i chunk = _mm_loadu_si128((__m128i*)(str + counter)); // load
 
         // unpack the first 8 bytes, padding with zeros
         const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
-        _mm_store_si128((__m128i*)dst, firstHalf); // store
+        _mm_store_si128((__m128i*)(dst + counter), firstHalf); // store
 
         // unpack the last 8 bytes, padding with zeros
         const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
-        _mm_store_si128((__m128i*)(dst + 8), secondHalf); // store
+        _mm_store_si128((__m128i*)(dst + counter + 8), secondHalf); // store
 
-        str += 16;
-        dst += 16;
-        size -= 16;
+        counter += 16;
     }
-    fromLatin1_epilog(dst, str, size);
+    size += 16;
+    fromLatin1_epilog(dst + counter, str + counter, size - counter);
 }
 
 void fromLatin1_sse4_pmovzxbw(ushort *dst, const char *str, int size)
 {
-    while (size >= 16) {
-        __m128i chunk = _mm_loadu_si128((__m128i*)str); // load
+    qptrdiff counter = 0;
+    size -= 16;
+    while (size >= counter) {
+        __m128i chunk = _mm_loadu_si128((__m128i*)(str + counter)); // load
 
         // unpack the first 8 bytes, padding with zeros
         const __m128i firstHalf = _mm_cvtepu8_epi16(chunk);
-        _mm_storeu_si128((__m128i*)dst, firstHalf); // store
+        _mm_storeu_si128((__m128i*)(dst + counter), firstHalf); // store
 
         // unpack the last 8 bytes, padding with zeros
         chunk = _mm_srli_si128(chunk, 8);
         const __m128i secondHalf = _mm_cvtepu8_epi16(chunk);
-        _mm_storeu_si128((__m128i*)(dst + 8), secondHalf); // store
+        _mm_storeu_si128((__m128i*)(dst + counter + 8), secondHalf); // store
 
-        str += 16;
-        dst += 16;
-        size -= 16;
+        counter += 16;
     }
-    fromLatin1_epilog(dst, str, size);
+    size += 16;
+    fromLatin1_epilog(dst + counter, str + counter, size - counter);
 }
 
 void fromLatin1_prolog_sse4_overcommit(ushort *dst, const char *str, int)
-- 
cgit v0.12