summary | refs | log | tree | commit | diff | stats
path: root/tests/benchmarks/corelib/tools
diff options
context:
space:
mode:
author: Thiago Macieira <thiago.macieira@nokia.com>, 2010-08-18 10:47:31 (GMT)
committer: Thiago Macieira <thiago.macieira@nokia.com>, 2010-08-24 10:36:42 (GMT)
commit: 5e8d8f82d38ce5a1b30d5d90ecb6bc096d52f4d8 (patch)
tree: cf64d0e77eaf3f310298eb63e3702627c2eaf3a8 /tests/benchmarks/corelib/tools
parent: 4f891889118d4bcc417382a0a646f3683c621b10 (diff)
download: Qt-5e8d8f82d38ce5a1b30d5d90ecb6bc096d52f4d8.zip
Qt-5e8d8f82d38ce5a1b30d5d90ecb6bc096d52f4d8.tar.gz
Qt-5e8d8f82d38ce5a1b30d5d90ecb6bc096d52f4d8.tar.bz2
Update the SSSE3-with-alignment function to use aligned loads.
This results in no change on the Core-i7, but another 2.6% improvement on the Atom (so it is now 8% better than 4-byte loads and 31% better than the current code).
Diffstat (limited to 'tests/benchmarks/corelib/tools')
-rw-r--r-- tests/benchmarks/corelib/tools/qstring/main.cpp | 20
1 files changed, 11 insertions, 9 deletions
diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp
index d40b9bc..e11d92e 100644
--- a/tests/benchmarks/corelib/tools/qstring/main.cpp
+++ b/tests/benchmarks/corelib/tools/qstring/main.cpp
@@ -1007,7 +1007,9 @@ static __attribute__((optimize("no-unroll-loops"))) int ucstrncmp_sse2_aligned(c
return ucstrncmp_short_tail(a + counter, b + counter, len);
}
-template<int N> static __attribute__((optimize("no-unroll-loops"))) int ucstrncmp_ssse3_alignr(const ushort *a, const ushort *b, int len)
+typedef __m128i (* MMLoadFunction)(const __m128i *);
+template<int N, MMLoadFunction LoadFunction = _mm_lddqu_si128>
+static inline __attribute__((optimize("no-unroll-loops"))) int ucstrncmp_ssse3_alignr(const ushort *a, const ushort *b, int len)
{
qptrdiff counter = 0;
__m128i lower, upper;
@@ -1018,7 +1020,7 @@ template<int N> static __attribute__((optimize("no-unroll-loops"))) int ucstrncm
upper = _mm_load_si128((__m128i *)(a + counter) + 1);
__m128i merged = _mm_alignr_epi8(upper, lower, N);
- __m128i m2 = _mm_lddqu_si128((__m128i *)(b + counter));
+ __m128i m2 = LoadFunction((__m128i *)(b + counter));
__m128i cmp = _mm_cmpeq_epi16(merged, m2);
ushort mask = ~uint(_mm_movemask_epi8(cmp));
if (mask) {
@@ -1092,23 +1094,23 @@ static int ucstrncmp_ssse3_aligning(const ushort *a, const ushort *b, int len)
a -= val/2;
if (val == 8)
- return ucstrncmp_ssse3_alignr<8>(a, b, len);
+ return ucstrncmp_ssse3_alignr<8, _mm_load_si128>(a, b, len);
else if (val == 0)
return ucstrncmp_sse2_aligned(a, b, len);
if (val < 8) {
if (val < 4)
- return ucstrncmp_ssse3_alignr<2>(a, b, len);
+ return ucstrncmp_ssse3_alignr<2, _mm_load_si128>(a, b, len);
else if (val == 4)
- return ucstrncmp_ssse3_alignr<4>(a, b, len);
+ return ucstrncmp_ssse3_alignr<4, _mm_load_si128>(a, b, len);
else
- return ucstrncmp_ssse3_alignr<6>(a, b, len);
+ return ucstrncmp_ssse3_alignr<6, _mm_load_si128>(a, b, len);
} else {
if (val < 12)
- return ucstrncmp_ssse3_alignr<10>(a, b, len);
+ return ucstrncmp_ssse3_alignr<10, _mm_load_si128>(a, b, len);
else if (val == 12)
- return ucstrncmp_ssse3_alignr<12>(a, b, len);
+ return ucstrncmp_ssse3_alignr<12, _mm_load_si128>(a, b, len);
else
- return ucstrncmp_ssse3_alignr<14>(a, b, len);
+ return ucstrncmp_ssse3_alignr<14, _mm_load_si128>(a, b, len);
}
}