Implement toLatin1_helper with SSE2

Encoding to Latin-1 can be done 8 characters at a time with SSE2.

The speed difference on a current CPU (relative to the previous code):
- fewer than 16 characters: no change (100%)
- 16-character string: 120%
- 1000 characters: 230%
- 10000 characters: 322%

Reviewed-by: Samuel Rødal
author: Benjamin Poulain <benjamin.poulain@nokia.com> 2010-02-24 13:22:37 (GMT)
committer: Benjamin Poulain <benjamin.poulain@nokia.com> 2010-02-24 14:08:09 (GMT)
commit: 3d25c963adadc5b0f8ae6282790aef11b5f6a9cc (patch)
tree: 8c6d12efcc4538d82a28f1982e192c6c2564efac /src/corelib/tools/qstring.cpp
parent: 1a0697308313ff13b3b6f4712d540304b6b0e575 (diff)
download: Qt-3d25c963adadc5b0f8ae6282790aef11b5f6a9cc.zip
Qt-3d25c963adadc5b0f8ae6282790aef11b5f6a9cc.tar.gz
Qt-3d25c963adadc5b0f8ae6282790aef11b5f6a9cc.tar.bz2
1 files changed, 48 insertions, 6 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index f8303bd..995e4cf 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -3480,12 +3480,54 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
     QByteArray ba;
     if (length) {
         ba.resize(length);
-        const ushort *i = reinterpret_cast<const ushort *>(data);
-        const ushort *e = i + length;
-        uchar *s = (uchar*) ba.data();
-        while (i != e) {
-            *s++ = (*i>0xff) ? '?' : (uchar) *i;
-            ++i;
+        const ushort *src = reinterpret_cast<const ushort *>(data);
+        uchar *dst = (uchar*) ba.data();
+#if defined(QT_ALWAYS_HAVE_SSE2)
+        if (length >= 16) {
+            const int chunkCount = length >> 4; // divided by 16
+            const __m128i questionMark = _mm_set1_epi16('?');
+            const __m128i thresholdMask = _mm_set1_epi16(0xff);
+            for (int i = 0; i < chunkCount; ++i) {
+                __m128i chunk1 = _mm_loadu_si128((__m128i*)src); // load
+                src += 8;
+                {
+                    // each 16 bit is equal to 0xFF if the source is outside latin 1 (>0xff)
+                    const __m128i offLimitMask = _mm_cmpgt_epi16(chunk1, thresholdMask);
+
+                    // offLimitQuestionMark contains '?' for each 16 bits that was off-limit
+                    // the 16 bits that were correct contains zeros
+                    const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
+
+                    // correctBytes contains the bytes that were in limit
+                    // the 16 bits that were off limits contains zeros
+                    const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk1);
+
+                    // merge offLimitQuestionMark and correctBytes to have the result
+                    chunk1 = _mm_or_si128(correctBytes, offLimitQuestionMark);
+                }
+
+                __m128i chunk2 = _mm_loadu_si128((__m128i*)src); // load
+                src += 8;
+                {
+                    // exactly the same operations as for the previous chunk of data
+                    const __m128i offLimitMask = _mm_cmpgt_epi16(chunk2, thresholdMask);
+                    const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
+                    const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk2);
+                    chunk2 = _mm_or_si128(correctBytes, offLimitQuestionMark);
+                }
+
+                // pack the two vector to 16 x 8bits elements
+                const __m128i result = _mm_packs_epi16(chunk1, chunk2);
+
+                _mm_storeu_si128((__m128i*)dst, result); // store
+                dst += 16;
+            }
+            length = length % 16;
+        }
+#endif
+        while (length--) {
+            *dst++ = (*src>0xff) ? '?' : (uchar) *src;
+            ++src;
         }
     }
     return ba;
author	Benjamin Poulain <benjamin.poulain@nokia.com>	2010-02-24 13:22:37 (GMT)
committer	Benjamin Poulain <benjamin.poulain@nokia.com>	2010-02-24 14:08:09 (GMT)
commit	3d25c963adadc5b0f8ae6282790aef11b5f6a9cc (patch)
tree	8c6d12efcc4538d82a28f1982e192c6c2564efac /src/corelib/tools/qstring.cpp
parent	1a0697308313ff13b3b6f4712d540304b6b0e575 (diff)
download	Qt-3d25c963adadc5b0f8ae6282790aef11b5f6a9cc.zip Qt-3d25c963adadc5b0f8ae6282790aef11b5f6a9cc.tar.gz Qt-3d25c963adadc5b0f8ae6282790aef11b5f6a9cc.tar.bz2