Fix the SIMD implementations of QString::toLatin1()

The SSE implementation used signed integers, which failed for characters with high values. The Neon implementation was using >= instead of > when creating the mask.

Reviewed-by: Samuel Rødal
author: Benjamin Poulain <benjamin.poulain@nokia.com> 2010-02-25 14:20:22 (GMT)
committer: Benjamin Poulain <benjamin.poulain@nokia.com> 2010-02-25 14:56:59 (GMT)
commit: badfab1ed209c0f9727980addef7de9083354845 (patch)
tree: 3eb4d24d973aa2022c264e35a2f650042bec97ab /src/corelib/tools/qstring.cpp
parent: 8d583adf2373736ca8bb5dd465e17a53b776d85a (diff)
download: Qt-badfab1ed209c0f9727980addef7de9083354845.zip
Qt-badfab1ed209c0f9727980addef7de9083354845.tar.gz
Qt-badfab1ed209c0f9727980addef7de9083354845.tar.bz2
1 files changed, 13 insertions, 7 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index cce25f4..e9b7b9a 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -3486,13 +3486,17 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
         if (length >= 16) {
             const int chunkCount = length >> 4; // divided by 16
             const __m128i questionMark = _mm_set1_epi16('?');
-            const __m128i thresholdMask = _mm_set1_epi16(0xff);
+            // SSE has no compare instruction for unsigned comparison.
+            // The variables must be shiffted + 0x8000 to be compared
+            const __m128i signedBitOffset = _mm_set1_epi16(0x8000);
+            const __m128i thresholdMask = _mm_set1_epi16(0xff + 0x8000);
             for (int i = 0; i < chunkCount; ++i) {
                 __m128i chunk1 = _mm_loadu_si128((__m128i*)src); // load
                 src += 8;
                 {
                     // each 16 bit is equal to 0xFF if the source is outside latin 1 (>0xff)
-                    const __m128i offLimitMask = _mm_cmpgt_epi16(chunk1, thresholdMask);
+                    const __m128i signedChunk = _mm_add_epi16(chunk1, signedBitOffset);
+                    const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
 
                     // offLimitQuestionMark contains '?' for each 16 bits that was off-limit
                     // the 16 bits that were correct contains zeros
@@ -3510,14 +3514,15 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
                 src += 8;
                 {
                     // exactly the same operations as for the previous chunk of data
-                    const __m128i offLimitMask = _mm_cmpgt_epi16(chunk2, thresholdMask);
+                    const __m128i signedChunk = _mm_add_epi16(chunk2, signedBitOffset);
+                    const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
                     const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
                     const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk2);
                     chunk2 = _mm_or_si128(correctBytes, offLimitQuestionMark);
                 }
 
                 // pack the two vector to 16 x 8bits elements
-                const __m128i result = _mm_packs_epi16(chunk1, chunk2);
+                const __m128i result = _mm_packus_epi16(chunk1, chunk2);
 
                 _mm_storeu_si128((__m128i*)dst, result); // store
                 dst += 16;
@@ -3525,9 +3530,10 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
             length = length % 16;
         }
 #elif QT_HAVE_NEON
-        // this use eactly the same method as for SSE except the packing
-        // which is done to 64 bits (8 x 8bits component).
         // Refer to the documentation of the SSE2 implementation
+        // this use eactly the same method as for SSE except:
+        // 1) neon has unsigned comparison
+        // 2) packing is done to 64 bits (8 x 8bits component).
         if (length >= 16) {
             const int chunkCount = length >> 3; // divided by 8
             const uint16x8_t questionMark = vdupq_n_u16('?'); // set
@@ -3536,7 +3542,7 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
                 uint16x8_t chunk = vld1q_u16((uint16_t *)src); // load
                 src += 8;
 
-                const uint16x8_t offLimitMask = vcgeq_u16(chunk, thresholdMask); // ==
+                const uint16x8_t offLimitMask = vcgtq_u16(chunk, thresholdMask); // chunk > thresholdMask
                 const uint16x8_t offLimitQuestionMark = vandq_u16(offLimitMask, questionMark); // offLimitMask & questionMark
                 const uint16x8_t correctBytes = vbicq_u16(chunk, offLimitMask); // !offLimitMask & chunk
                 chunk = vorrq_u16(correctBytes, offLimitQuestionMark); // correctBytes | offLimitQuestionMark
author	Benjamin Poulain <benjamin.poulain@nokia.com>	2010-02-25 14:20:22 (GMT)
committer	Benjamin Poulain <benjamin.poulain@nokia.com>	2010-02-25 14:56:59 (GMT)
commit	badfab1ed209c0f9727980addef7de9083354845 (patch)
tree	3eb4d24d973aa2022c264e35a2f650042bec97ab /src/corelib/tools/qstring.cpp
parent	8d583adf2373736ca8bb5dd465e17a53b776d85a (diff)
download	Qt-badfab1ed209c0f9727980addef7de9083354845.zip Qt-badfab1ed209c0f9727980addef7de9083354845.tar.gz Qt-badfab1ed209c0f9727980addef7de9083354845.tar.bz2