diff options
Diffstat (limited to 'src/corelib/tools/qstring.cpp')
-rw-r--r-- | src/corelib/tools/qstring.cpp | 112 |
1 files changed, 104 insertions, 8 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index ac1bee7..e9b7b9a 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -46,6 +46,7 @@ #include <qtextcodec.h> #endif #include <private/qutfcodec_p.h> +#include "qsimd_p.h" #include <qdatastream.h> #include <qlist.h> #include "qlocale.h" @@ -3479,12 +3480,82 @@ static QByteArray toLatin1_helper(const QChar *data, int length) QByteArray ba; if (length) { ba.resize(length); - const ushort *i = reinterpret_cast<const ushort *>(data); - const ushort *e = i + length; - uchar *s = (uchar*) ba.data(); - while (i != e) { - *s++ = (*i>0xff) ? '?' : (uchar) *i; - ++i; + const ushort *src = reinterpret_cast<const ushort *>(data); + uchar *dst = (uchar*) ba.data(); +#if defined(QT_ALWAYS_HAVE_SSE2) + if (length >= 16) { + const int chunkCount = length >> 4; // divided by 16 + const __m128i questionMark = _mm_set1_epi16('?'); + // SSE has no compare instruction for unsigned comparison. + // The variables must be shiffted + 0x8000 to be compared + const __m128i signedBitOffset = _mm_set1_epi16(0x8000); + const __m128i thresholdMask = _mm_set1_epi16(0xff + 0x8000); + for (int i = 0; i < chunkCount; ++i) { + __m128i chunk1 = _mm_loadu_si128((__m128i*)src); // load + src += 8; + { + // each 16 bit is equal to 0xFF if the source is outside latin 1 (>0xff) + const __m128i signedChunk = _mm_add_epi16(chunk1, signedBitOffset); + const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask); + + // offLimitQuestionMark contains '?' for each 16 bits that was off-limit + // the 16 bits that were correct contains zeros + const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); + + // correctBytes contains the bytes that were in limit + // the 16 bits that were off limits contains zeros + const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk1); + + // merge offLimitQuestionMark and correctBytes to have the result + chunk1 = _mm_or_si128(correctBytes, offLimitQuestionMark); + } + + __m128i chunk2 = _mm_loadu_si128((__m128i*)src); // load + src += 8; + { + // exactly the same operations as for the previous chunk of data + const __m128i signedChunk = _mm_add_epi16(chunk2, signedBitOffset); + const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask); + const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); + const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk2); + chunk2 = _mm_or_si128(correctBytes, offLimitQuestionMark); + } + + // pack the two vector to 16 x 8bits elements + const __m128i result = _mm_packus_epi16(chunk1, chunk2); + + _mm_storeu_si128((__m128i*)dst, result); // store + dst += 16; + } + length = length % 16; + } +#elif QT_HAVE_NEON + // Refer to the documentation of the SSE2 implementation + // this use eactly the same method as for SSE except: + // 1) neon has unsigned comparison + // 2) packing is done to 64 bits (8 x 8bits component). + if (length >= 16) { + const int chunkCount = length >> 3; // divided by 8 + const uint16x8_t questionMark = vdupq_n_u16('?'); // set + const uint16x8_t thresholdMask = vdupq_n_u16(0xff); // set + for (int i = 0; i < chunkCount; ++i) { + uint16x8_t chunk = vld1q_u16((uint16_t *)src); // load + src += 8; + + const uint16x8_t offLimitMask = vcgtq_u16(chunk, thresholdMask); // chunk > thresholdMask + const uint16x8_t offLimitQuestionMark = vandq_u16(offLimitMask, questionMark); // offLimitMask & questionMark + const uint16x8_t correctBytes = vbicq_u16(chunk, offLimitMask); // !offLimitMask & chunk + chunk = vorrq_u16(correctBytes, offLimitQuestionMark); // correctBytes | offLimitQuestionMark + const uint8x8_t result = vmovn_u16(chunk); // narrowing move->packing + vst1_u8(dst, result); // store + dst += 8; + } + length = length % 8; + } +#endif + while (length--) { + *dst++ = (*src>0xff) ? '?' : (uchar) *src; + ++src; } } return ba; @@ -3647,10 +3718,35 @@ QString::Data *QString::fromLatin1_helper(const char *str, int size) d->alloc = d->size = size; d->clean = d->asciiCache = d->simpletext = d->righttoleft = d->capacity = 0; d->data = d->array; - ushort *i = d->data; d->array[size] = '\0'; + ushort *dst = d->data; + /* SIMD: + * Unpacking with SSE has been shown to improve performance on recent CPUs + * The same method gives no improvement with NEON. + */ +#if defined(QT_ALWAYS_HAVE_SSE2) + if (size >= 16) { + int chunkCount = size >> 4; // divided by 16 + const __m128i nullMask = _mm_set1_epi32(0); + for (int i = 0; i < chunkCount; ++i) { + const __m128i chunk = _mm_loadu_si128((__m128i*)str); // load + str += 16; + + // unpack the first 8 bytes, padding with zeros + const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask); + _mm_storeu_si128((__m128i*)dst, firstHalf); // store + dst += 8; + + // unpack the last 8 bytes, padding with zeros + const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask); + _mm_storeu_si128((__m128i*)dst, secondHalf); // store + dst += 8; + } + size = size % 16; + } +#endif while (size--) - *i++ = (uchar)*str++; + *dst++ = (uchar)*str++; } return d; } |