summaryrefslogtreecommitdiffstats
path: root/src/corelib/tools/qstring.cpp
diff options
context:
space:
mode:
authorBenjamin Poulain <benjamin.poulain@nokia.com>2010-02-25 14:20:22 (GMT)
committerBenjamin Poulain <benjamin.poulain@nokia.com>2010-02-25 14:56:59 (GMT)
commitbadfab1ed209c0f9727980addef7de9083354845 (patch)
tree3eb4d24d973aa2022c264e35a2f650042bec97ab /src/corelib/tools/qstring.cpp
parent8d583adf2373736ca8bb5dd465e17a53b776d85a (diff)
downloadQt-badfab1ed209c0f9727980addef7de9083354845.zip
Qt-badfab1ed209c0f9727980addef7de9083354845.tar.gz
Qt-badfab1ed209c0f9727980addef7de9083354845.tar.bz2
Fix the SIMD implementations of QString::toLatin1()
The SSE implementation used signed integers, which failed for characters with high values. The Neon implementation was using >= instead of > when creating the mask. Reviewed-by: Samuel Rødal
Diffstat (limited to 'src/corelib/tools/qstring.cpp')
-rw-r--r--src/corelib/tools/qstring.cpp20
1 files changed, 13 insertions, 7 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index cce25f4..e9b7b9a 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -3486,13 +3486,17 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
if (length >= 16) {
const int chunkCount = length >> 4; // divided by 16
const __m128i questionMark = _mm_set1_epi16('?');
- const __m128i thresholdMask = _mm_set1_epi16(0xff);
+ // SSE has no compare instruction for unsigned comparison.
+ // The values must be shifted by 0x8000 before they can be compared
+ const __m128i signedBitOffset = _mm_set1_epi16(0x8000);
+ const __m128i thresholdMask = _mm_set1_epi16(0xff + 0x8000);
for (int i = 0; i < chunkCount; ++i) {
__m128i chunk1 = _mm_loadu_si128((__m128i*)src); // load
src += 8;
{
// each 16 bit is equal to 0xFF if the source is outside latin 1 (>0xff)
- const __m128i offLimitMask = _mm_cmpgt_epi16(chunk1, thresholdMask);
+ const __m128i signedChunk = _mm_add_epi16(chunk1, signedBitOffset);
+ const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
// offLimitQuestionMark contains '?' for each 16 bits that was off-limit
// the 16 bits that were correct contains zeros
@@ -3510,14 +3514,15 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
src += 8;
{
// exactly the same operations as for the previous chunk of data
- const __m128i offLimitMask = _mm_cmpgt_epi16(chunk2, thresholdMask);
+ const __m128i signedChunk = _mm_add_epi16(chunk2, signedBitOffset);
+ const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk2);
chunk2 = _mm_or_si128(correctBytes, offLimitQuestionMark);
}
// pack the two vector to 16 x 8bits elements
- const __m128i result = _mm_packs_epi16(chunk1, chunk2);
+ const __m128i result = _mm_packus_epi16(chunk1, chunk2);
_mm_storeu_si128((__m128i*)dst, result); // store
dst += 16;
@@ -3525,9 +3530,10 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
length = length % 16;
}
#elif QT_HAVE_NEON
- // this use eactly the same method as for SSE except the packing
- // which is done to 64 bits (8 x 8bits component).
// Refer to the documentation of the SSE2 implementation
+ // this uses exactly the same method as for SSE except:
+ // 1) neon has unsigned comparison
+ // 2) packing is done to 64 bits (8 x 8bits component).
if (length >= 16) {
const int chunkCount = length >> 3; // divided by 8
const uint16x8_t questionMark = vdupq_n_u16('?'); // set
@@ -3536,7 +3542,7 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
uint16x8_t chunk = vld1q_u16((uint16_t *)src); // load
src += 8;
- const uint16x8_t offLimitMask = vcgeq_u16(chunk, thresholdMask); // ==
+ const uint16x8_t offLimitMask = vcgtq_u16(chunk, thresholdMask); // chunk > thresholdMask
const uint16x8_t offLimitQuestionMark = vandq_u16(offLimitMask, questionMark); // offLimitMask & questionMark
const uint16x8_t correctBytes = vbicq_u16(chunk, offLimitMask); // !offLimitMask & chunk
chunk = vorrq_u16(correctBytes, offLimitQuestionMark); // correctBytes | offLimitQuestionMark