summaryrefslogtreecommitdiffstats
path: root/src/corelib
diff options
context:
space:
mode:
author: Thiago Macieira <thiago.macieira@nokia.com> 2010-12-22 13:42:33 (GMT)
committer: Thiago Macieira <thiago.macieira@nokia.com> 2011-06-14 09:08:58 (GMT)
commit: bdad106358ae177d1345f5ff85c0e38cfeb5ca90 (patch)
tree: e8bed55d682fafccc645e28a980bf9a91074ea22 /src/corelib
parent: 8680ced782c5e225b2e15c50c05493a23410b119 (diff)
downloadQt-bdad106358ae177d1345f5ff85c0e38cfeb5ca90.zip
Qt-bdad106358ae177d1345f5ff85c0e38cfeb5ca90.tar.gz
Qt-bdad106358ae177d1345f5ff85c0e38cfeb5ca90.tar.bz2
Improve toLatin1 x86 SIMD by using a new SSE4.1 instruction
The new instruction is PBLENDVB, which builds its result by selecting each byte from one of two registers, depending on whether the corresponding mask byte is set (0xff) or zero. The SSE2 code requires three instructions (and, andnot, or) to do the same. The equivalent Neon instruction is VBSL (bit select).

Reviewed-by: Samuel Rødal
Diffstat (limited to 'src/corelib')
-rw-r--r-- src/corelib/tools/qstring.cpp | 9
1 file changed, 9 insertions, 0 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index 8ad4e70..7cbef98 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -3579,6 +3579,10 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
const __m128i signedChunk = _mm_add_epi16(chunk1, signedBitOffset);
const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
+#ifdef __SSE4_1__
+ chunk1 = _mm_blendv_epi8(chunk1, questionMark, offLimitMask);
+#else
+
// offLimitQuestionMark contains '?' for each 16 bits that was off-limit
// the 16 bits that were correct contains zeros
const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
@@ -3589,6 +3593,7 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
// merge offLimitQuestionMark and correctBytes to have the result
chunk1 = _mm_or_si128(correctBytes, offLimitQuestionMark);
+#endif
}
__m128i chunk2 = _mm_loadu_si128((__m128i*)src); // load
@@ -3597,9 +3602,13 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
// exactly the same operations as for the previous chunk of data
const __m128i signedChunk = _mm_add_epi16(chunk2, signedBitOffset);
const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
+#ifdef __SSE4_1__
+ chunk2 = _mm_blendv_epi8(chunk2, questionMark, offLimitMask);
+#else
const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk2);
chunk2 = _mm_or_si128(correctBytes, offLimitQuestionMark);
+#endif
}
// pack the two vector to 16 x 8bits elements