diff options
author | Thiago Macieira <thiago.macieira@nokia.com> | 2010-12-22 13:42:33 (GMT) |
---|---|---|
committer | Thiago Macieira <thiago.macieira@nokia.com> | 2011-06-14 09:08:58 (GMT) |
commit | bdad106358ae177d1345f5ff85c0e38cfeb5ca90 (patch) | |
tree | e8bed55d682fafccc645e28a980bf9a91074ea22 /src/corelib | |
parent | 8680ced782c5e225b2e15c50c05493a23410b119 (diff) | |
download | Qt-bdad106358ae177d1345f5ff85c0e38cfeb5ca90.zip Qt-bdad106358ae177d1345f5ff85c0e38cfeb5ca90.tar.gz Qt-bdad106358ae177d1345f5ff85c0e38cfeb5ca90.tar.bz2 |
Improve toLatin1 x86 SIMD by using a new SSE4.1 instruction
The new instruction is PBLENDVB, which builds its result by selecting
each byte from one of two registers, depending on whether the
corresponding mask byte is set (0xff, as produced by the compare) or zero.
The SSE2 code requires three instructions (and, andnot, or).
The equivalent Neon instruction is VBSL (bit select).
Reviewed-by: Samuel Rødal
Diffstat (limited to 'src/corelib')
-rw-r--r-- | src/corelib/tools/qstring.cpp | 9 |
1 file changed, 9 insertions, 0 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index 8ad4e70..7cbef98 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -3579,6 +3579,10 @@ static QByteArray toLatin1_helper(const QChar *data, int length) const __m128i signedChunk = _mm_add_epi16(chunk1, signedBitOffset); const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask); +#ifdef __SSE4_1__ + chunk1 = _mm_blendv_epi8(chunk1, questionMark, offLimitMask); +#else + // offLimitQuestionMark contains '?' for each 16 bits that was off-limit // the 16 bits that were correct contains zeros const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); @@ -3589,6 +3593,7 @@ static QByteArray toLatin1_helper(const QChar *data, int length) // merge offLimitQuestionMark and correctBytes to have the result chunk1 = _mm_or_si128(correctBytes, offLimitQuestionMark); +#endif } __m128i chunk2 = _mm_loadu_si128((__m128i*)src); // load @@ -3597,9 +3602,13 @@ static QByteArray toLatin1_helper(const QChar *data, int length) // exactly the same operations as for the previous chunk of data const __m128i signedChunk = _mm_add_epi16(chunk2, signedBitOffset); const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask); +#ifdef __SSE4_1__ + chunk2 = _mm_blendv_epi8(chunk2, questionMark, offLimitMask); +#else const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk2); chunk2 = _mm_or_si128(correctBytes, offLimitQuestionMark); +#endif } // pack the two vector to 16 x 8bits elements |