Implement toLatin1_helper with Neon

Implement toLatin1 with Neon to process 8 characters at a time. Using Neon improves the speed if there are at least 16 characters; there is no improvement when using it for only 8 characters. Speed relative to the previous implementation on an N900: 8 characters: no change (100%); 16 characters: 126%; 1000 characters: 361%; 10000 characters: 423%. Reviewed-by: Samuel Rødal
author: Benjamin Poulain <benjamin.poulain@nokia.com> 2010-02-25 11:33:53 (GMT)
committer: Benjamin Poulain <benjamin.poulain@nokia.com> 2010-02-25 12:32:30 (GMT)
commit: 107a7aed0d64971136637e8ac26bf112108ec3bf (patch)
tree: 5e869f0aec0d63079c1262ec5f4382052ad4c743
parent: dadb99ea2c59d7d0f7a83134b7df5aaaaf80a995 (diff)
download: Qt-107a7aed0d64971136637e8ac26bf112108ec3bf.zip
Qt-107a7aed0d64971136637e8ac26bf112108ec3bf.tar.gz
Qt-107a7aed0d64971136637e8ac26bf112108ec3bf.tar.bz2
1 files changed, 22 insertions, 0 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index 995e4cf..cce25f4 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -3524,6 +3524,28 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
             }
             length = length % 16;
         }
+#elif QT_HAVE_NEON
+        // This uses exactly the same method as the SSE2 implementation, except
+        // for the packing, which is done to 64 bits (8 x 8-bit components).
+        // Refer to the documentation of the SSE2 implementation.
+        if (length >= 16) {
+            const int chunkCount = length >> 3; // divided by 8
+            const uint16x8_t questionMark = vdupq_n_u16('?'); // set
+            const uint16x8_t thresholdMask = vdupq_n_u16(0xff); // set
+            for (int i = 0; i < chunkCount; ++i) {
+                uint16x8_t chunk = vld1q_u16((uint16_t *)src); // load
+                src += 8;
+
+                const uint16x8_t offLimitMask = vcgeq_u16(chunk, thresholdMask); // ==
+                const uint16x8_t offLimitQuestionMark = vandq_u16(offLimitMask, questionMark); // offLimitMask & questionMark
+                const uint16x8_t correctBytes = vbicq_u16(chunk, offLimitMask); // !offLimitMask & chunk
+                chunk = vorrq_u16(correctBytes, offLimitQuestionMark); // correctBytes | offLimitQuestionMark
+                const uint8x8_t result = vmovn_u16(chunk); // narrowing move->packing
+                vst1_u8(dst, result); // store
+                dst += 8;
+            }
+            length = length % 8;
+        }
 #endif
         while (length--) {
             *dst++ = (*src>0xff) ? '?' : (uchar) *src;
author	Benjamin Poulain <benjamin.poulain@nokia.com>	2010-02-25 11:33:53 (GMT)
committer	Benjamin Poulain <benjamin.poulain@nokia.com>	2010-02-25 12:32:30 (GMT)
commit	107a7aed0d64971136637e8ac26bf112108ec3bf (patch)
tree	5e869f0aec0d63079c1262ec5f4382052ad4c743
parent	dadb99ea2c59d7d0f7a83134b7df5aaaaf80a995 (diff)
download	Qt-107a7aed0d64971136637e8ac26bf112108ec3bf.zip Qt-107a7aed0d64971136637e8ac26bf112108ec3bf.tar.gz Qt-107a7aed0d64971136637e8ac26bf112108ec3bf.tar.bz2