diff options
author | Benjamin Poulain <benjamin.poulain@nokia.com> | 2010-02-24 13:22:37 (GMT) |
---|---|---|
committer | Benjamin Poulain <benjamin.poulain@nokia.com> | 2010-02-24 14:08:09 (GMT) |
commit | 3d25c963adadc5b0f8ae6282790aef11b5f6a9cc (patch) | |
tree | 8c6d12efcc4538d82a28f1982e192c6c2564efac | |
parent | 1a0697308313ff13b3b6f4712d540304b6b0e575 (diff) | |
download | Qt-3d25c963adadc5b0f8ae6282790aef11b5f6a9cc.zip Qt-3d25c963adadc5b0f8ae6282790aef11b5f6a9cc.tar.gz Qt-3d25c963adadc5b0f8ae6282790aef11b5f6a9cc.tar.bz2 |
Implement toLatin1_helper with SSE2
Encoding to latin1 can be done 8 characters at a time with SSE2.
The speed difference on a current CPU (100% = no change):
- fewer than 16 characters: 100% (no change)
- 16-character string: 120%
- 1000 characters: 230%
- 10000 characters: 322%
Reviewed-by: Samuel Rødal
-rw-r--r-- | src/corelib/tools/qstring.cpp | 54 |
1 files changed, 48 insertions, 6 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index f8303bd..995e4cf 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -3480,12 +3480,54 @@ static QByteArray toLatin1_helper(const QChar *data, int length) QByteArray ba; if (length) { ba.resize(length); - const ushort *i = reinterpret_cast<const ushort *>(data); - const ushort *e = i + length; - uchar *s = (uchar*) ba.data(); - while (i != e) { - *s++ = (*i>0xff) ? '?' : (uchar) *i; - ++i; + const ushort *src = reinterpret_cast<const ushort *>(data); + uchar *dst = (uchar*) ba.data(); +#if defined(QT_ALWAYS_HAVE_SSE2) + if (length >= 16) { + const int chunkCount = length >> 4; // divided by 16 + const __m128i questionMark = _mm_set1_epi16('?'); + const __m128i thresholdMask = _mm_set1_epi16(0xff); + for (int i = 0; i < chunkCount; ++i) { + __m128i chunk1 = _mm_loadu_si128((__m128i*)src); // load + src += 8; + { + // each 16 bit is equal to 0xFF if the source is outside latin 1 (>0xff) + const __m128i offLimitMask = _mm_cmpgt_epi16(chunk1, thresholdMask); + + // offLimitQuestionMark contains '?' 
for each 16 bits that was off-limit + // the 16 bits that were correct contains zeros + const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); + + // correctBytes contains the bytes that were in limit + // the 16 bits that were off limits contains zeros + const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk1); + + // merge offLimitQuestionMark and correctBytes to have the result + chunk1 = _mm_or_si128(correctBytes, offLimitQuestionMark); + } + + __m128i chunk2 = _mm_loadu_si128((__m128i*)src); // load + src += 8; + { + // exactly the same operations as for the previous chunk of data + const __m128i offLimitMask = _mm_cmpgt_epi16(chunk2, thresholdMask); + const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); + const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk2); + chunk2 = _mm_or_si128(correctBytes, offLimitQuestionMark); + } + + // pack the two vector to 16 x 8bits elements + const __m128i result = _mm_packs_epi16(chunk1, chunk2); + + _mm_storeu_si128((__m128i*)dst, result); // store + dst += 16; + } + length = length % 16; + } +#endif + while (length--) { + *dst++ = (*src>0xff) ? '?' : (uchar) *src; + ++src; } } return ba; |