diff options
author | Thiago Macieira <thiago.macieira@nokia.com> | 2011-03-22 13:14:58 (GMT) |
---|---|---|
committer | Thiago Macieira <thiago.macieira@nokia.com> | 2011-03-22 14:52:04 (GMT) |
commit | 835108c44c0f5263856cb96c257e11600ffbf9e6 (patch) | |
tree | 8d8318c12831ced6b964af42cb4dedbbaee1f5c9 /tests/benchmarks/corelib | |
parent | 220658198238ccdede7fb933c16c7119dcb6863b (diff) | |
download | Qt-835108c44c0f5263856cb96c257e11600ffbf9e6.zip Qt-835108c44c0f5263856cb96c257e11600ffbf9e6.tar.gz Qt-835108c44c0f5263856cb96c257e11600ffbf9e6.tar.bz2 |
Add ARM Neon versions of fromLatin1 and fromUtf8
The fromLatin1 code is very simple, yet the handwritten assembly
performs better due to the use of post-increments.
The fromUtf8 code has two alternatives. Neon lacks an instruction
similar to SSE2's PMOVMSKB (the _mm_movemask_epi8 intrinsic), which
extracts the most significant bit of each byte and stores the result in
a general-purpose register. The SSE2 UTF-8 code uses it to detect bytes
with the highest bit set. To compensate, we tried two alternatives:
1) AND the comparison result with a vector containing {128, 64, ..., 1},
then do 3 pairwise adds (VPADD.I8), which make the mask bits accumulate
in the lowest component of the vector.
Trick found in: http://hilbert-space.de/?p=22 (comment 16-17)
2) Extract the two 32-bit words from the doubleword (64-bit) Neon
register and do the rest of the work in ARM assembly.
The latter version appears to perform better.
Diffstat (limited to 'tests/benchmarks/corelib')
-rw-r--r-- | tests/benchmarks/corelib/tools/qstring/main.cpp | 192 |
1 files changed, 192 insertions, 0 deletions
diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp index 0e34fd7..0074a2e 100644 --- a/tests/benchmarks/corelib/tools/qstring/main.cpp +++ b/tests/benchmarks/corelib/tools/qstring/main.cpp @@ -1655,6 +1655,63 @@ void fromLatin1_prolog_sse4_overcommit(ushort *dst, const char *str, int) #endif #endif +#ifdef __ARM_NEON__ +static inline void fromLatin1_epilog(ushort *dst, const char *str, int size) +{ + if (!size) return; + dst[0] = (uchar)str[0]; + if (!--size) return; + dst[1] = (uchar)str[1]; + if (!--size) return; + dst[2] = (uchar)str[2]; + if (!--size) return; + dst[3] = (uchar)str[3]; + if (!--size) return; + dst[4] = (uchar)str[4]; + if (!--size) return; + dst[5] = (uchar)str[5]; + if (!--size) return; + dst[6] = (uchar)str[6]; + if (!--size) return; + dst[7] = (uchar)str[7]; + if (!--size) return; +} + +void fromLatin1_neon_improved(ushort *dst, const char *str, int len) +{ + while (len >= 8) { + // load 8 bytes into one doubleword Neon register + const uint8x8_t chunk = vld1_u8((uint8_t *)str); + str += 8; + + // expand 8 bytes into 16 bytes in a quadword register + const uint16x8_t expanded = vmovl_u8(chunk); + vst1q_u16(dst, expanded); // store + dst += 8; + + len -= 8; + } + fromLatin1_epilog(dst, str, len); +} + +void fromLatin1_neon_handwritten(ushort *dst, const char *str, int len) +{ + // same as above, but handwritten Neon + while (len >= 8) { + uint16x8_t chunk; + asm ( + "vld1.8 %[chunk], [%[str]]!\n" + "vmovl.u8 %q[chunk], %[chunk]\n" + "vst1.16 %h[chunk], [%[dst]]!\n" + : [dst] "+r" (dst), + [str] "+r" (str), + [chunk] "=w" (chunk)); + len -= 8; + } + + fromLatin1_epilog(dst, str, len); +} +#endif void tst_QString::fromLatin1Alternatives_data() const { @@ -1672,6 +1729,10 @@ void tst_QString::fromLatin1Alternatives_data() const QTest::newRow("sse4-pmovzxbw") << &fromLatin1_sse4_pmovzxbw; #endif #endif +#ifdef __ARM_NEON__ + QTest::newRow("neon-improved") << &fromLatin1_neon_improved; + 
QTest::newRow("neon-handwritten") << &fromLatin1_neon_handwritten; +#endif } extern StringData fromLatin1Data; @@ -2210,6 +2271,130 @@ int fromUtf8_sse2_trusted_no_bom(ushort *qch, const char *chars, int len) } #endif +#ifdef __ARM_NEON__ +int fromUtf8_latin1_neon(ushort *dst, const char *chars, int len) +{ + fromLatin1_neon_improved(dst, chars, len); + return len; +} + +int fromUtf8_neon(ushort *qch, const char *chars, int len) +{ + if (len > 3 + && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) { + // starts with a byte order mark + chars += 3; + len -= 3; + } + + ushort *dst = qch; + const uint8x8_t highBit = vdup_n_u8(0x80); + const uint8x8_t bitMask = { 128, 64, 32, 16, 8, 4, 2, 1 }; + while (len >= 8) { + // load 8 bytes into one doubleword Neon register + const uint8x8_t chunk = vld1_u8((uint8_t *)chars); + const uint16x8_t expanded = vmovl_u8(chunk); + vst1q_u16(dst, expanded); + + uint8x8_t highBits = vtst_u8(chunk, highBit); + highBits = vand_u8(highBits, bitMask); + highBits = vpadd_u8(highBits, highBits); + highBits = vpadd_u8(highBits, highBits); + highBits = vpadd_u8(highBits, highBits); + + int mask = vget_lane_u8(highBits, 0); + + // find the first bit set in mask + // sets pos to 32 if no bits are found + qptrdiff pos; + asm ("rbit %0, %1\n" + "clz %0, %0" + : "=r" (pos) : "r" (mask)); + + if (__builtin_expect(pos > 8, 1)) { + chars += 8; + dst += 8; + len -= 8; + } else { + // UTF-8 character found + // which one? 
+ + extract_utf8_multibyte<false>(dst, chars, pos, len); + chars += pos; + dst += pos; + len -= pos; + } + } + + qptrdiff counter = 0; + while (counter < len) { + uchar ch = chars[counter]; + if ((ch & 0x80) == 0) { + dst[counter] = ch; + ++counter; + continue; + } + + // UTF-8 character found + extract_utf8_multibyte<false>(dst, chars, counter, len); + } + return dst + counter - qch; +} + +int fromUtf8_neon_trusted(ushort *qch, const char *chars, int len) +{ + ushort *dst = qch; + const uint8x8_t highBit = vdup_n_u8(0x80); + while (len >= 8) { + // load 8 bytes into one doubleword Neon register + const uint8x8_t chunk = vld1_u8((uint8_t *)chars); + const uint16x8_t expanded = vmovl_u8(chunk); + vst1q_u16(dst, expanded); + + uint8x8_t highBits = vtst_u8(chunk, highBit); + // we need to find the lowest byte set + int mask_low = vget_lane_u32(vreinterpret_u32_u8(highBits), 0); + int mask_high = vget_lane_u32(vreinterpret_u32_u8(highBits), 1); + + if (__builtin_expect(mask_low == 0 && mask_high == 0, 1)) { + chars += 8; + dst += 8; + len -= 8; + } else { + // UTF-8 character found + // which one? + qptrdiff pos; + asm ("rbit %0, %1\n" + "clz %1, %1\n" + : "=r" (pos) + : "r" (mask_low ? 
mask_low : mask_high)); + // now mask_low contains the number of leading zeroes + // or the value 32 (0x20) if no zeroes were found + // the number of leading zeroes is 8*pos + pos /= 8; + + extract_utf8_multibyte<true>(dst, chars, pos, len); + chars += pos; + dst += pos; + len -= pos; + } + } + + qptrdiff counter = 0; + while (counter < len) { + uchar ch = chars[counter]; + if ((ch & 0x80) == 0) { + dst[counter] = ch; + ++counter; + continue; + } + + // UTF-8 character found + extract_utf8_multibyte<true>(dst, chars, counter, len); + } + return dst + counter - qch; +} +#endif void tst_QString::fromUtf8Alternatives_data() const { @@ -2222,12 +2407,19 @@ void tst_QString::fromUtf8Alternatives_data() const QTest::newRow("sse2-optimized-for-ascii") << &fromUtf8_sse2_optimised_for_ascii; QTest::newRow("sse2-trusted-no-bom") << &fromUtf8_sse2_trusted_no_bom; #endif +#ifdef __ARM_NEON__ + QTest::newRow("neon") << &fromUtf8_neon; + QTest::newRow("neon-trusted-no-bom") << &fromUtf8_neon_trusted; +#endif QTest::newRow("latin1-generic") << &fromUtf8_latin1_regular; #ifdef __SSE2__ QTest::newRow("latin1-sse2-qt4.7") << &fromUtf8_latin1_qt47; QTest::newRow("latin1-sse2-improved") << &fromUtf8_latin1_sse2_improved; #endif +#ifdef __ARM_NEON__ + QTest::newRow("latin1-neon-improved") << &fromUtf8_latin1_neon; +#endif } extern StringData fromUtf8Data; |