| author | Thiago Macieira <thiago.macieira@nokia.com> | 2010-08-18 10:47:31 (GMT) |
|---|---|---|
| committer | Thiago Macieira <thiago.macieira@nokia.com> | 2010-08-24 10:36:42 (GMT) |
| commit | 5e8d8f82d38ce5a1b30d5d90ecb6bc096d52f4d8 (patch) | |
| tree | cf64d0e77eaf3f310298eb63e3702627c2eaf3a8 /tests/benchmarks/corelib/tools | |
| parent | 4f891889118d4bcc417382a0a646f3683c621b10 (diff) | |
Update the SSSE3-with-alignment function to use aligned loads.
This results in no change on the Core i7, but another 2.6% improvement on
the Atom (so it's now 8% better than 4-byte loads and 31% better than the
current code).
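The reason aligned loads are available here at all is the `palignr` strategy this function family uses: round `a` down to a 16-byte boundary, do two aligned loads, and let `_mm_alignr_epi8` splice out the 16 bytes a misaligned load would have returned. Below is a minimal, self-contained sketch of that splice; the buffer and names are illustrative, not from the Qt sources, and it builds with something like `g++ -mssse3`:

```cpp
#include <tmmintrin.h>  // SSSE3: _mm_alignr_epi8 (pulls in the SSE2/SSE3 headers)
#include <cstdio>

int main()
{
    // 32 bytes, 16-byte aligned, filled with 0..31 so offsets are visible.
    __attribute__((aligned(16))) unsigned char buf[32];
    for (int i = 0; i < 32; ++i)
        buf[i] = (unsigned char)i;

    // Two aligned loads (movdqa) covering bytes [0,16) and [16,32).
    __m128i lower = _mm_load_si128((const __m128i *)buf);
    __m128i upper = _mm_load_si128((const __m128i *)(buf + 16));

    // palignr splices the pair: this is byte-for-byte what an unaligned
    // load from buf + 6 would have fetched.
    __m128i merged = _mm_alignr_epi8(upper, lower, 6);

    unsigned char out[16];
    _mm_storeu_si128((__m128i *)out, merged);
    std::printf("%d %d\n", out[0], out[15]);  // prints "6 21"
    return 0;
}
```

Two `movdqa` plus a `palignr` avoid any load that straddles a cache line, which is presumably why the gain shows up on the in-order Atom but not on the Core i7.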
Diffstat (limited to 'tests/benchmarks/corelib/tools')
-rw-r--r-- | tests/benchmarks/corelib/tools/qstring/main.cpp | 20 |
1 file changed, 11 insertions, 9 deletions
```diff
diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp
index d40b9bc..e11d92e 100644
--- a/tests/benchmarks/corelib/tools/qstring/main.cpp
+++ b/tests/benchmarks/corelib/tools/qstring/main.cpp
@@ -1007,7 +1007,9 @@ static __attribute__((optimize("no-unroll-loops"))) int ucstrncmp_sse2_aligned(c
     return ucstrncmp_short_tail(a + counter, b + counter, len);
 }
 
-template<int N> static __attribute__((optimize("no-unroll-loops"))) int ucstrncmp_ssse3_alignr(const ushort *a, const ushort *b, int len)
+typedef __m128i (* MMLoadFunction)(const __m128i *);
+template<int N, MMLoadFunction LoadFunction = _mm_lddqu_si128>
+static inline __attribute__((optimize("no-unroll-loops"))) int ucstrncmp_ssse3_alignr(const ushort *a, const ushort *b, int len)
 {
     qptrdiff counter = 0;
     __m128i lower, upper;
@@ -1018,7 +1020,7 @@ template<int N> static __attribute__((optimize("no-unroll-loops"))) int ucstrncm
         upper = _mm_load_si128((__m128i *)(a + counter) + 1);
         __m128i merged = _mm_alignr_epi8(upper, lower, N);
 
-        __m128i m2 = _mm_lddqu_si128((__m128i *)(b + counter));
+        __m128i m2 = LoadFunction((__m128i *)(b + counter));
         __m128i cmp = _mm_cmpeq_epi16(merged, m2);
         ushort mask = ~uint(_mm_movemask_epi8(cmp));
         if (mask) {
@@ -1092,23 +1094,23 @@ static int ucstrncmp_ssse3_aligning(const ushort *a, const ushort *b, int len)
     a -= val/2;
 
     if (val == 8)
-        return ucstrncmp_ssse3_alignr<8>(a, b, len);
+        return ucstrncmp_ssse3_alignr<8, _mm_load_si128>(a, b, len);
     else if (val == 0)
         return ucstrncmp_sse2_aligned(a, b, len);
 
     if (val < 8) {
         if (val < 4)
-            return ucstrncmp_ssse3_alignr<2>(a, b, len);
+            return ucstrncmp_ssse3_alignr<2, _mm_load_si128>(a, b, len);
         else if (val == 4)
-            return ucstrncmp_ssse3_alignr<4>(a, b, len);
+            return ucstrncmp_ssse3_alignr<4, _mm_load_si128>(a, b, len);
         else
-            return ucstrncmp_ssse3_alignr<6>(a, b, len);
+            return ucstrncmp_ssse3_alignr<6, _mm_load_si128>(a, b, len);
     } else {
         if (val < 12)
-            return ucstrncmp_ssse3_alignr<10>(a, b, len);
+            return ucstrncmp_ssse3_alignr<10, _mm_load_si128>(a, b, len);
        else if (val == 12)
-            return ucstrncmp_ssse3_alignr<12>(a, b, len);
+            return ucstrncmp_ssse3_alignr<12, _mm_load_si128>(a, b, len);
         else
-            return ucstrncmp_ssse3_alignr<14>(a, b, len);
+            return ucstrncmp_ssse3_alignr<14, _mm_load_si128>(a, b, len);
     }
 }
```
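The patch's central trick is making the load a non-type template parameter, so one loop body serves both the unaligned case (`_mm_lddqu_si128`) and the aligned case (`_mm_load_si128`) with no runtime branch and no duplicated code. A minimal sketch of the same pattern follows, with a made-up `sum16` function standing in for the string comparison. Note the portability caveat: GCC accepts its intrinsics as template arguments, which is what the patch relies on; other compilers may reject taking an intrinsic's address.

```cpp
// Build with e.g. `g++ -msse3`.
#include <pmmintrin.h>  // SSE3: _mm_lddqu_si128 (includes emmintrin.h for SSE2)
#include <cstdio>

// The load used in the hot loop is chosen at compile time, mirroring the
// MMLoadFunction/LoadFunction parameter in the patch. "LoadFn" and "sum16"
// are invented names for this sketch, not Qt code.
typedef __m128i (*LoadFn)(const __m128i *);

template <LoadFn Load>
static int sum16(const unsigned short *p, int n)
{
    __m128i acc = _mm_setzero_si128();
    for (int i = 0; i + 8 <= n; i += 8)
        acc = _mm_add_epi16(acc, Load((const __m128i *)(p + i)));

    // Spill the accumulator and sum its eight 16-bit lanes.
    unsigned short lanes[8];
    _mm_storeu_si128((__m128i *)lanes, acc);
    int total = 0;
    for (int i = 0; i < 8; ++i)
        total += lanes[i];
    return total;
}

int main()
{
    __attribute__((aligned(16))) unsigned short data[16];
    for (int i = 0; i < 16; ++i)
        data[i] = (unsigned short)i;

    // Caller can prove 16-byte alignment: instantiate with movdqa.
    std::printf("%d\n", sum16<_mm_load_si128>(data, 16));     // 0+..+15 = 120

    // Possibly misaligned pointer: same loop body, lddqu load.
    std::printf("%d\n", sum16<_mm_lddqu_si128>(data + 1, 8)); // 1+..+8 = 36
    return 0;
}
```

Because the template argument is a compile-time constant, the compiler can inline the chosen load directly; the function pointer never materializes at runtime.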