summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Michał Górny <mgorny@gentoo.org> 2017-10-24 06:54:19 (GMT)
committer: Benjamin Peterson <benjamin@python.org> 2017-10-24 06:54:19 (GMT)
commit: 1aa00ff383c43335e4a5044274617dbf59bc839e (patch)
tree: 7a9435e859119e00b0159268a8456a2d7a7db6ea
parent: 3b66ebe7727dba68c2c6ccf0cd85a4c31255b9b4 (diff)
downloadcpython-1aa00ff383c43335e4a5044274617dbf59bc839e.zip
cpython-1aa00ff383c43335e4a5044274617dbf59bc839e.tar.gz
cpython-1aa00ff383c43335e4a5044274617dbf59bc839e.tar.bz2
fixes bpo-31834: Use optimized code for BLAKE2 only with SSSE3+ (#4066)
Rework the code that chooses the BLAKE2 code path: instead of using the optimized variant on all x86_64 machines, use it only when SSSE3 or a better instruction set is available. Firstly, this solves the problem of using the pure SSE2 code path on x86_64 machines. As reported in the bug, this code is slower than the reference code on all tested x86_64 machines. Furthermore, on an Athlon64, which lacks SSSE3, it is even 2.5 times slower than the reference code! Checking for SSSE3 therefore ensures that the optimized implementation will only be used when it has a chance of performing better. Secondly, this makes it possible to use the SSSE3+ optimizations on 32-bit x86 systems. This allows for as much as a 2x speed gain on modern 32-bit x86 systems (tested in a 32-bit chroot).
-rw-r--r-- Misc/NEWS.d/next/Library/2017-10-23-23-27-52.bpo-31834.InwC6O.rst | 2
-rw-r--r-- Modules/_blake2/blake2b_impl.c | 4
-rw-r--r-- Modules/_blake2/blake2s_impl.c | 4
-rw-r--r-- setup.py | 9
4 files changed, 8 insertions, 11 deletions
diff --git a/Misc/NEWS.d/next/Library/2017-10-23-23-27-52.bpo-31834.InwC6O.rst b/Misc/NEWS.d/next/Library/2017-10-23-23-27-52.bpo-31834.InwC6O.rst
new file mode 100644
index 0000000..0fe3950
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2017-10-23-23-27-52.bpo-31834.InwC6O.rst
@@ -0,0 +1,2 @@
+Use optimized code for BLAKE2 only with SSSE3+. The pure SSE2 implementation
+is slower than the pure C reference implementation.
diff --git a/Modules/_blake2/blake2b_impl.c b/Modules/_blake2/blake2b_impl.c
index b1ae3e9..3c2a035 100644
--- a/Modules/_blake2/blake2b_impl.c
+++ b/Modules/_blake2/blake2b_impl.c
@@ -26,7 +26,9 @@
#include "impl/blake2.h"
#include "impl/blake2-impl.h" /* for secure_zero_memory() and store48() */
-#ifdef BLAKE2_USE_SSE
+/* pure SSE2 implementation is very slow, so only use the more optimized SSSE3+
+ * https://bugs.python.org/issue31834 */
+#if defined(__SSSE3__) || defined(__SSE4_1__) || defined(__AVX__) || defined(__XOP__)
#include "impl/blake2b.c"
#else
#include "impl/blake2b-ref.c"
diff --git a/Modules/_blake2/blake2s_impl.c b/Modules/_blake2/blake2s_impl.c
index 3615a38..2c56972 100644
--- a/Modules/_blake2/blake2s_impl.c
+++ b/Modules/_blake2/blake2s_impl.c
@@ -26,7 +26,9 @@
#include "impl/blake2.h"
#include "impl/blake2-impl.h" /* for secure_zero_memory() and store48() */
-#ifdef BLAKE2_USE_SSE
+/* pure SSE2 implementation is very slow, so only use the more optimized SSSE3+
+ * https://bugs.python.org/issue31834 */
+#if defined(__SSSE3__) || defined(__SSE4_1__) || defined(__AVX__) || defined(__XOP__)
#include "impl/blake2s.c"
#else
#include "impl/blake2s-ref.c"
diff --git a/setup.py b/setup.py
index 11c4ec6..51e5d7e 100644
--- a/setup.py
+++ b/setup.py
@@ -922,19 +922,10 @@ class PyBuildExt(build_ext):
'Modules/_blake2/impl/*'))
blake2_deps.append('hashlib.h')
- blake2_macros = []
- if (not cross_compiling and
- os.uname().machine == "x86_64" and
- sys.maxsize > 2**32):
- # Every x86_64 machine has at least SSE2. Check for sys.maxsize
- # in case that kernel is 64-bit but userspace is 32-bit.
- blake2_macros.append(('BLAKE2_USE_SSE', '1'))
-
exts.append( Extension('_blake2',
['_blake2/blake2module.c',
'_blake2/blake2b_impl.c',
'_blake2/blake2s_impl.c'],
- define_macros=blake2_macros,
depends=blake2_deps) )
sha3_deps = glob(os.path.join(os.getcwd(), srcdir,