summaryrefslogtreecommitdiffstats
path: root/src/corelib/tools
diff options
context:
space:
mode:
Diffstat (limited to 'src/corelib/tools')
-rw-r--r--src/corelib/tools/qsimd.cpp191
-rw-r--r--src/corelib/tools/qsimd_p.h1
-rw-r--r--src/corelib/tools/qstring.cpp63
-rw-r--r--src/corelib/tools/qstring.h1
4 files changed, 201 insertions, 55 deletions
diff --git a/src/corelib/tools/qsimd.cpp b/src/corelib/tools/qsimd.cpp
index 8005d7d..68ab033 100644
--- a/src/corelib/tools/qsimd.cpp
+++ b/src/corelib/tools/qsimd.cpp
@@ -41,6 +41,7 @@
#include "qsimd_p.h"
#include <QByteArray>
+#include <stdio.h>
#if defined(Q_OS_WINCE)
#include <windows.h>
@@ -50,15 +51,20 @@
#include <intrin.h>
#endif
+#if defined(Q_OS_LINUX) && defined(__arm__)
+#include "private/qcore_unix_p.h"
+
+#include <asm/hwcap.h>
+#include <linux/auxvec.h>
+#endif
+
QT_BEGIN_NAMESPACE
-uint qDetectCPUFeatures()
+#if defined (Q_OS_WINCE)
+static inline uint detectProcessorFeatures()
{
- static uint features = 0xffffffff;
- if (features != 0xffffffff)
- return features;
+ uint features = 0;
-#if defined (Q_OS_WINCE)
#if defined (ARM)
if (IsProcessorFeaturePresent(PF_ARM_INTEL_WMMX)) {
features = IWMMXT;
@@ -78,22 +84,57 @@ uint qDetectCPUFeatures()
#endif
features = 0;
return features;
-#elif defined(QT_HAVE_IWMMXT)
+}
+
+#elif defined(__arm__) || defined(__arm) || defined(QT_HAVE_IWMMXT) || defined(QT_HAVE_NEON)
+static inline uint detectProcessorFeatures()
+{
+ uint features = 0;
+
+#if defined(Q_OS_LINUX)
+ int auxv = ::qt_safe_open("/proc/self/auxv", O_RDONLY);
+ if (auxv != -1) {
+ unsigned long vector[64];
+ int nread;
+ while (features == 0) {
+ nread = ::qt_safe_read(auxv, (char *)vector, sizeof vector);
+ if (nread <= 0) {
+ // EOF or error
+ break;
+ }
+
+ int max = nread / (sizeof vector[0]);
+ for (int i = 0; i < max; i += 2)
+ if (vector[i] == AT_HWCAP) {
+ if (vector[i+1] & HWCAP_IWMMXT)
+ features |= IWMMXT;
+ if (vector[i+1] & HWCAP_NEON)
+ features |= NEON;
+ break;
+ }
+ }
+
+ ::qt_safe_close(auxv);
+ return features;
+ }
+ // fall back if /proc/self/auxv wasn't found
+#endif
+
+#if defined(QT_HAVE_IWMMXT)
// runtime detection only available when running as a previlegied process
- static const bool doIWMMXT = !qgetenv("QT_NO_IWMMXT").toInt();
- features = doIWMMXT ? IWMMXT : 0;
- return features;
+ features = IWMMXT;
#elif defined(QT_HAVE_NEON)
- static const bool doNEON = !qgetenv("QT_NO_NEON").toInt();
- features = doNEON ? NEON : 0;
+ features = NEON;
+#endif
+
return features;
-#else
- features = 0;
-#if defined(__x86_64__) || defined(Q_OS_WIN64)
- features = MMX|SSE|SSE2|CMOV;
-#elif defined(__ia64__)
- features = MMX|SSE|SSE2;
+}
+
#elif defined(__i386__) || defined(_M_IX86)
+static inline uint detectProcessorFeatures()
+{
+ uint features = 0;
+
unsigned int extended_result = 0;
unsigned int feature_result = 0;
uint result = 0;
@@ -149,6 +190,7 @@ uint qDetectCPUFeatures()
:
: "%eax", "%ecx", "%edx"
);
+
#elif defined (Q_OS_WIN)
_asm {
push eax
@@ -210,6 +252,7 @@ uint qDetectCPUFeatures()
}
#endif
+
// result now contains the standard feature bits
if (result & (1u << 15))
features |= CMOV;
@@ -236,9 +279,13 @@ uint qDetectCPUFeatures()
if (feature_result & (1u << 28))
features |= AVX;
-#endif // i386
+ return features;
+}
-#if defined(__x86_64__) || defined(Q_OS_WIN64)
+#elif defined(__x86_64) || defined(Q_OS_WIN64)
+static inline uint detectProcessorFeatures()
+{
+ uint features = MMX|SSE|SSE2|CMOV;
uint feature_result = 0;
#if defined(Q_CC_GNU)
@@ -282,33 +329,97 @@ uint qDetectCPUFeatures()
features |= SSE4_2;
if (feature_result & (1u << 28))
features |= AVX;
-#endif // x86_64
-#if defined(QT_HAVE_MMX)
- if (qgetenv("QT_NO_MMX").toInt())
- features ^= MMX;
-#endif
- if (qgetenv("QT_NO_MMXEXT").toInt())
- features ^= MMXEXT;
+ return features;
+}
-#if defined(QT_HAVE_3DNOW)
- if (qgetenv("QT_NO_3DNOW").toInt())
- features ^= MMX3DNOW;
-#endif
- if (qgetenv("QT_NO_3DNOWEXT").toInt())
- features ^= MMX3DNOWEXT;
+#elif defined(__ia64__)
+static inline uint detectProcessorFeatures()
+{
+ return MMX|SSE|SSE2;
+}
-#if defined(QT_HAVE_SSE)
- if (qgetenv("QT_NO_SSE").toInt())
- features ^= SSE;
-#endif
-#if defined(QT_HAVE_SSE2)
- if (qgetenv("QT_NO_SSE2").toInt())
- features ^= SSE2;
+#else
+static inline uint detectProcessorFeatures()
+{
+ return 0;
+}
#endif
+/*
+ * Use kdesdk/scripts/generate_string_table.pl to update the table below.
+ * Here's the data (don't forget the ONE leading space):
+ mmx
+ mmxext
+ mmx3dnow
+ mmx3dnowext
+ sse
+ sse2
+ cmov
+ iwmmxt
+ neon
+ sse3
+ ssse3
+ sse4.1
+ sse4.2
+ avx
+ */
+
+// begin generated
+static const char features_string[] =
+ " mmx\0"
+ " mmxext\0"
+ " mmx3dnow\0"
+ " mmx3dnowext\0"
+ " sse\0"
+ " sse2\0"
+ " cmov\0"
+ " iwmmxt\0"
+ " neon\0"
+ " sse3\0"
+ " ssse3\0"
+ " sse4.1\0"
+ " sse4.2\0"
+ " avx\0"
+ "\0";
+
+static const int features_indices[] = {
+ 0, 5, 13, 23, 36, 41, 47, 53,
+ 61, 67, 73, 80, 88, 96, -1
+};
+// end generated
+
+const int features_count = (sizeof features_indices - 1) / (sizeof features_indices[0]);
+
+uint qDetectCPUFeatures()
+{
+ static QBasicAtomicInt features = Q_BASIC_ATOMIC_INITIALIZER(-1);
+ if (features != -1)
+ return features;
+
+ uint f = detectProcessorFeatures();
+ QByteArray disable = qgetenv("QT_NO_CPU_FEATURE");
+ if (!disable.isEmpty()) {
+ disable.prepend(' ');
+ for (int i = 0; i < features_count; ++i) {
+ if (disable.contains(features_string + features_indices[i]))
+ f &= ~(1 << i);
+ }
+ }
+
+ features = f;
return features;
-#endif
+}
+
+void qDumpCPUFeatures()
+{
+ uint features = qDetectCPUFeatures();
+ printf("Processor features: ");
+ for (int i = 0; i < features_count; ++i) {
+ if (features & (1 << i))
+ printf("%s", features_string + features_indices[i]);
+ }
+ puts("");
}
QT_END_NAMESPACE
diff --git a/src/corelib/tools/qsimd_p.h b/src/corelib/tools/qsimd_p.h
index a3148fb..2626657 100644
--- a/src/corelib/tools/qsimd_p.h
+++ b/src/corelib/tools/qsimd_p.h
@@ -85,6 +85,7 @@ QT_BEGIN_HEADER
// SSE4.1 and SSE4.2 intrinsics
#if (defined(QT_HAVE_SSE4_1) || defined(QT_HAVE_SSE4_2)) && (defined(__SSE4_1__) || defined(Q_CC_MSVC))
#include <smmintrin.h>
+#include <nmmintrin.h>
#endif
// AVX intrinsics
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index de45a63..bb7105b 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -189,7 +189,7 @@ static int ucstricmp(const ushort *a, const ushort *ae, const uchar *b)
return 1;
}
-// Unicode case-insensitive comparison
+// Unicode case-sensitive comparison
static int ucstrcmp(const QChar *a, int alen, const QChar *b, int blen)
{
if (a == b && alen == blen)
@@ -218,22 +218,55 @@ static int ucstrnicmp(const ushort *a, const ushort *b, int l)
return ucstricmp(a, a + l, b, b + l);
}
+// Benchmarking indicates that doing memcmp is much slower than
+// executing the comparison ourselves.
+//
+// The profiling was done on a population of calls to qMemEquals, generated
+// during a run of the demo browser. The profile of the data (32-bit x86
+// Linux) was:
+//
+// total number of comparisons: 21353
+// longest string compared: 95
+// average comparison length: 14.8786
+// cache-line crosses: 5661 (13.3%)
+// alignment histogram:
+// 0xXXX0 = 512 (1.2%) strings, 0 (0.0%) of which same-aligned
+// 0xXXX2 = 15087 (35.3%) strings, 5145 (34.1%) of which same-aligned
+// 0xXXX4 = 525 (1.2%) strings, 0 (0.0%) of which same-aligned
+// 0xXXX6 = 557 (1.3%) strings, 6 (1.1%) of which same-aligned
+// 0xXXX8 = 509 (1.2%) strings, 0 (0.0%) of which same-aligned
+// 0xXXXa = 24358 (57.0%) strings, 9901 (40.6%) of which same-aligned
+// 0xXXXc = 557 (1.3%) strings, 0 (0.0%) of which same-aligned
+// 0xXXXe = 601 (1.4%) strings, 15 (2.5%) of which same-aligned
+// total = 42706 (100%) strings, 15067 (35.3%) of which same-aligned
+//
+// 92% of the strings have alignment of 2 or 10, which is due to malloc on
+// 32-bit Linux returning values aligned to 8 bytes, and offsetof(array, QString::Data) == 18.
+//
+// The profile on 64-bit will be different since offsetof(array, QString::Data) == 26.
+//
+// The benchmark results were, for a Core-i7 @ 2.67 GHz 32-bit, compiled with -O3 -funroll-loops:
+// 16-bit loads only: 872,301 CPU ticks [Qt 4.5 / memcmp]
+// 32- and 16-bit loads: 773,362 CPU ticks [Qt 4.6]
+// SSE2 "movdqu" 128-bit loads: 618,736 CPU ticks
+// SSE3 "lddqu" 128-bit loads: 619,954 CPU ticks
+// SSSE3 "palignr" corrections: 852,147 CPU ticks
+// SSE4.2 "pcmpestrm": 738,702 CPU ticks
+//
+// The same benchmark on an Atom N450 @ 1.66 GHz, is:
+// 16-bit loads only: 2,185,882 CPU ticks
+// 32- and 16-bit loads: 1,805,060 CPU ticks
+// SSE2 "movdqu" 128-bit loads: 2,529,843 CPU ticks
+// SSE3 "lddqu" 128-bit loads: 2,514,858 CPU ticks
+// SSSE3 "palignr" corrections: 2,160,325 CPU ticks
+// SSE4.2 not available
+//
+// The conclusion we reach is that alignment the SSE2 unaligned code can gain
+// 20% improvement in performance in some systems, but suffers a penalty due
+// to the unaligned loads on others.
+
static bool qMemEquals(const quint16 *a, const quint16 *b, int length)
{
- // Benchmarking indicates that doing memcmp is much slower than
- // executing the comparison ourselves.
- // To make it even faster, we do a 32-bit comparison, comparing
- // twice the amount of data as a normal word-by-word comparison.
- //
- // Benchmarking results on a 2.33 GHz Core2 Duo, with a 64-QChar
- // block of data, with 4194304 iterations (per iteration):
- // operation usec cpu ticks
- // memcmp 330 710
- // 16-bit 79 167-171
- // 32-bit aligned 49 105-109
- //
- // Testing also indicates that unaligned 32-bit loads are as
- // performant as 32-bit aligned.
if (a == b || !length)
return true;
diff --git a/src/corelib/tools/qstring.h b/src/corelib/tools/qstring.h
index 2252353..2633e78 100644
--- a/src/corelib/tools/qstring.h
+++ b/src/corelib/tools/qstring.h
@@ -613,6 +613,7 @@ private:
ushort asciiCache : 1;
ushort capacity : 1;
ushort reserved : 11;
+ // ### Qt5: try to ensure that "array" is aligned to 16 bytes on both 32- and 64-bit
ushort array[1];
};
static Data shared_null;