diff options
-rw-r--r-- | Doc/library/sys.rst | 11 | ||||
-rw-r--r-- | Doc/license.rst | 29 | ||||
-rw-r--r-- | Doc/whatsnew/3.4.rst | 1 | ||||
-rw-r--r-- | Include/Python.h | 1 | ||||
-rw-r--r-- | Include/object.h | 17 | ||||
-rw-r--r-- | Include/pyhash.h | 147 | ||||
-rw-r--r-- | Include/pyport.h | 19 | ||||
-rwxr-xr-x | Lib/test/regrtest.py | 2 | ||||
-rw-r--r-- | Lib/test/test_hash.py | 150 | ||||
-rw-r--r-- | Lib/test/test_sys.py | 23 | ||||
-rw-r--r-- | Makefile.pre.in | 2 | ||||
-rw-r--r-- | Misc/ACKS | 1 | ||||
-rw-r--r-- | Misc/NEWS | 3 | ||||
-rw-r--r-- | Modules/pyexpat.c | 2 | ||||
-rw-r--r-- | Objects/bytesobject.c | 2 | ||||
-rw-r--r-- | Objects/memoryobject.c | 2 | ||||
-rw-r--r-- | Objects/object.c | 146 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 35 | ||||
-rw-r--r-- | PCbuild/pythoncore.vcxproj | 2 | ||||
-rw-r--r-- | PCbuild/pythoncore.vcxproj.filters | 6 | ||||
-rw-r--r-- | Python/pyhash.c | 430 | ||||
-rw-r--r-- | Python/pythonrun.c | 3 | ||||
-rw-r--r-- | Python/random.c | 13 | ||||
-rw-r--r-- | Python/sysmodule.c | 20 | ||||
-rwxr-xr-x | configure | 116 | ||||
-rw-r--r-- | configure.ac | 72 | ||||
-rw-r--r-- | pyconfig.h.in | 16 |
27 files changed, 1029 insertions, 242 deletions
diff --git a/Doc/library/sys.rst b/Doc/library/sys.rst index 341764a..b885de8 100644 --- a/Doc/library/sys.rst +++ b/Doc/library/sys.rst @@ -594,9 +594,20 @@ always available. | :const:`imag` | multiplier used for the imaginary part of a | | | complex number | +---------------------+--------------------------------------------------+ + | :const:`algorithm` | name of the algorithm for hashing of str, bytes, | + | | and memoryview | + +---------------------+--------------------------------------------------+ + | :const:`hash_bits` | internal output size of the hash algorithm | + +---------------------+--------------------------------------------------+ + | :const:`seed_bits` | size of the seed key of the hash algorithm | + +---------------------+--------------------------------------------------+ + .. versionadded:: 3.2 + .. versionchanged: 3.4 + Added *algorithm*, *hash_bits* and *seed_bits* + .. data:: hexversion diff --git a/Doc/license.rst b/Doc/license.rst index 598ba86..5e6ed26 100644 --- a/Doc/license.rst +++ b/Doc/license.rst @@ -609,6 +609,35 @@ the following note:: http://creativecommons.org/publicdomain/zero/1.0/ +SipHash24 +--------- + +The file :file:`Python/pyhash.c` contains Marek Majkowski' implementation of +Dan Bernstein's SipHash24 algorithm. The contains the following note:: + + <MIT License> + Copyright (c) 2013 Marek Majkowski <marek@popcount.org> + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + </MIT License> + + Original location: + https://github.com/majek/csiphash/ + + Solution inspired by code from: + Samuel Neves (supercop/crypto_auth/siphash24/little) + djb (supercop/crypto_auth/siphash24/little2) + Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c) + + strtod and dtoa --------------- diff --git a/Doc/whatsnew/3.4.rst b/Doc/whatsnew/3.4.rst index f72f544..19b20bf 100644 --- a/Doc/whatsnew/3.4.rst +++ b/Doc/whatsnew/3.4.rst @@ -116,6 +116,7 @@ CPython implementation improvements: * :ref:`PEP 442: Safe object finalization <pep-442>` * :ref:`PEP 445: Configurable memory allocators <pep-445>` +* :pep:`456` Secure and interchangeable hash algorithm * Improve finalization of Python modules to avoid setting their globals to None, in most cases (:issue:`18214`). * A more efficient :mod:`marshal` format (:issue:`16475`). diff --git a/Include/Python.h b/Include/Python.h index a78a721..2dd8290 100644 --- a/Include/Python.h +++ b/Include/Python.h @@ -68,6 +68,7 @@ #include "object.h" #include "objimpl.h" #include "typeslots.h" +#include "pyhash.h" #include "pydebug.h" diff --git a/Include/object.h b/Include/object.h index a6130fe..8de2208 100644 --- a/Include/object.h +++ b/Include/object.h @@ -562,23 +562,6 @@ PyAPI_FUNC(PyObject *) PyObject_Dir(PyObject *); PyAPI_FUNC(int) Py_ReprEnter(PyObject *); PyAPI_FUNC(void) Py_ReprLeave(PyObject *); -/* Helpers for hash functions */ -#ifndef Py_LIMITED_API -PyAPI_FUNC(Py_hash_t) _Py_HashDouble(double); -PyAPI_FUNC(Py_hash_t) _Py_HashPointer(void*); -PyAPI_FUNC(Py_hash_t) _Py_HashBytes(unsigned char*, Py_ssize_t); -#endif - -typedef struct { - Py_hash_t prefix; - Py_hash_t suffix; -} _Py_HashSecret_t; -PyAPI_DATA(_Py_HashSecret_t) _Py_HashSecret; - -#ifdef Py_DEBUG -PyAPI_DATA(int) _Py_HashSecret_Initialized; -#endif - /* Helper for passing objects to printf and the like */ #define PyObject_REPR(obj) _PyUnicode_AsString(PyObject_Repr(obj)) diff --git a/Include/pyhash.h b/Include/pyhash.h new file mode 100644 index 0000000..83c9281 --- /dev/null +++ b/Include/pyhash.h @@ -0,0 +1,147 @@ +#ifndef Py_HASH_H + +#define Py_HASH_H +#ifdef __cplusplus +extern "C" { +#endif + +/* Helpers for hash functions */ +#ifndef Py_LIMITED_API +PyAPI_FUNC(Py_hash_t) _Py_HashDouble(double); +PyAPI_FUNC(Py_hash_t) _Py_HashPointer(void*); +PyAPI_FUNC(Py_hash_t) _Py_HashBytes(const void*, Py_ssize_t); +#endif + +/* Prime multiplier used in string and various other hashes. */ +#define _PyHASH_MULTIPLIER 1000003UL /* 0xf4243 */ + +/* Parameters used for the numeric hash implementation. See notes for + _Py_HashDouble in Objects/object.c. Numeric hashes are based on + reduction modulo the prime 2**_PyHASH_BITS - 1. */ + +#if SIZEOF_VOID_P >= 8 +# define _PyHASH_BITS 61 +#else +# define _PyHASH_BITS 31 +#endif + +#define _PyHASH_MODULUS (((size_t)1 << _PyHASH_BITS) - 1) +#define _PyHASH_INF 314159 +#define _PyHASH_NAN 0 +#define _PyHASH_IMAG _PyHASH_MULTIPLIER + + +/* hash secret + * + * memory layout on 64 bit systems + * cccccccc cccccccc cccccccc uc -- unsigned char[24] + * pppppppp ssssssss ........ fnv -- two Py_hash_t + * k0k0k0k0 k1k1k1k1 ........ siphash -- two PY_UINT64_T + * ........ ........ ssssssss djbx33a -- 16 bytes padding + one Py_hash_t + * ........ ........ eeeeeeee pyexpat XML hash salt + * + * memory layout on 32 bit systems + * cccccccc cccccccc cccccccc uc + * ppppssss ........ ........ fnv -- two Py_hash_t + * k0k0k0k0 k1k1k1k1 ........ siphash -- two PY_UINT64_T (*) + * ........ ........ ssss.... djbx33a -- 16 bytes padding + one Py_hash_t + * ........ ........ eeee.... pyexpat XML hash salt + * + * (*) The siphash member may not be available on 32 bit platforms without + * an unsigned int64 data type. + */ +typedef union { + /* ensure 24 bytes */ + unsigned char uc[24]; + /* two Py_hash_t for FNV */ + struct { + Py_hash_t prefix; + Py_hash_t suffix; + } fnv; +#ifdef PY_UINT64_T + /* two uint64 for SipHash24 */ + struct { + PY_UINT64_T k0; + PY_UINT64_T k1; + } siphash; +#endif + /* a different (!) Py_hash_t for small string optimization */ + struct { + unsigned char padding[16]; + Py_hash_t suffix; + } djbx33a; + struct { + unsigned char padding[16]; + Py_hash_t hashsalt; + } expat; +} _Py_HashSecret_t; +PyAPI_DATA(_Py_HashSecret_t) _Py_HashSecret; + +#ifdef Py_DEBUG +PyAPI_DATA(int) _Py_HashSecret_Initialized; +#endif + + +/* hash function definition */ +#ifndef Py_LIMITED_API +typedef struct { + Py_hash_t (*const hash)(const void *, Py_ssize_t); + const char *name; + const int hash_bits; + const int seed_bits; +} PyHash_FuncDef; + +PyAPI_FUNC(PyHash_FuncDef*) PyHash_GetFuncDef(void); +#endif + + +/* cutoff for small string DJBX33A optimization in range [1, cutoff). + * + * About 50% of the strings in a typical Python application are smaller than + * 6 to 7 chars. However DJBX33A is vulnerable to hash collision attacks. + * NEVER use DJBX33A for long strings! + * + * A Py_HASH_CUTOFF of 0 disables small string optimization. 32 bit platforms + * should use a smaller cutoff because it is easier to create colliding + * strings. A cutoff of 7 on 64bit platforms and 5 on 32bit platforms should + * provide a decent safety margin. + */ +#ifndef Py_HASH_CUTOFF +# define Py_HASH_CUTOFF 0 +#elif (Py_HASH_CUTOFF > 7 || Py_HASH_CUTOFF < 0) +# error Py_HASH_CUTOFF must in range 0...7. +#endif /* Py_HASH_CUTOFF */ + + +/* hash algorithm selection + * + * The values for Py_HASH_SIPHASH24 and Py_HASH_FNV are hard-coded in the + * configure script. + * + * - FNV is available on all platforms and architectures. + * - SIPHASH24 only works on plaforms that provide PY_UINT64_T and doesn't + * require aligned memory for integers. + * - With EXTERNAL embedders can provide an alternative implementation with:: + * + * PyHash_FuncDef PyHash_Func = {...}; + * + * XXX: Figure out __declspec() for extern PyHash_FuncDef. + */ +#define Py_HASH_EXTERNAL 0 +#define Py_HASH_SIPHASH24 1 +#define Py_HASH_FNV 2 + +#ifndef Py_HASH_ALGORITHM +# if (defined(PY_UINT64_T) && defined(PY_UINT32_T) \ + && !defined(HAVE_ALIGNED_REQUIRED)) +# define Py_HASH_ALGORITHM Py_HASH_SIPHASH24 +# else +# define Py_HASH_ALGORITHM Py_HASH_FNV +# endif /* uint64_t && uint32_t && aligned */ +#endif /* Py_HASH_ALGORITHM */ + +#ifdef __cplusplus +} +#endif + +#endif /* !Py_HASH_H */ diff --git a/Include/pyport.h b/Include/pyport.h index ca20b22..b6b426a 100644 --- a/Include/pyport.h +++ b/Include/pyport.h @@ -144,23 +144,6 @@ Used in: PY_LONG_LONG #endif #endif -/* Prime multiplier used in string and various other hashes. */ -#define _PyHASH_MULTIPLIER 1000003UL /* 0xf4243 */ - -/* Parameters used for the numeric hash implementation. See notes for - _Py_HashDouble in Objects/object.c. Numeric hashes are based on - reduction modulo the prime 2**_PyHASH_BITS - 1. */ - -#if SIZEOF_VOID_P >= 8 -#define _PyHASH_BITS 61 -#else -#define _PyHASH_BITS 31 -#endif -#define _PyHASH_MODULUS (((size_t)1 << _PyHASH_BITS) - 1) -#define _PyHASH_INF 314159 -#define _PyHASH_NAN 0 -#define _PyHASH_IMAG _PyHASH_MULTIPLIER - /* uintptr_t is the C9X name for an unsigned integral type such that a * legitimate void* can be cast to uintptr_t and then back to void* again * without loss of information. Similarly for intptr_t, wrt a signed @@ -199,8 +182,10 @@ typedef Py_intptr_t Py_ssize_t; #endif /* Py_hash_t is the same size as a pointer. */ +#define SIZEOF_PY_HASH_T SIZEOF_SIZE_T typedef Py_ssize_t Py_hash_t; /* Py_uhash_t is the unsigned equivalent needed to calculate numeric hash. */ +#define SIZEOF_PY_UHASH_T SIZEOF_SIZE_T typedef size_t Py_uhash_t; /* Largest possible value of size_t. diff --git a/Lib/test/regrtest.py b/Lib/test/regrtest.py index a5d707e..c1c831f 100755 --- a/Lib/test/regrtest.py +++ b/Lib/test/regrtest.py @@ -601,6 +601,8 @@ def main(tests=None, **kwargs): print("==", platform.python_implementation(), *sys.version.split()) print("== ", platform.platform(aliased=True), "%s-endian" % sys.byteorder) + print("== ", "hash algorithm:", sys.hash_info.algorithm, + "64bit" if sys.maxsize > 2**32 else "32bit") print("== ", os.getcwd()) print("Testing with flags:", sys.flags) diff --git a/Lib/test/test_hash.py b/Lib/test/test_hash.py index e3ab6e4..66e4155 100644 --- a/Lib/test/test_hash.py +++ b/Lib/test/test_hash.py @@ -12,6 +12,40 @@ from collections import Hashable IS_64BIT = sys.maxsize > 2**32 +def lcg(x, length=16): + """Linear congruential generator""" + if x == 0: + return bytes(length) + out = bytearray(length) + for i in range(length): + x = (214013 * x + 2531011) & 0x7fffffff + out[i] = (x >> 16) & 0xff + return bytes(out) + +def pysiphash(uint64): + """Convert SipHash24 output to Py_hash_t + """ + assert 0 <= uint64 < (1 << 64) + # simple unsigned to signed int64 + if uint64 > (1 << 63) - 1: + int64 = uint64 - (1 << 64) + else: + int64 = uint64 + # mangle uint64 to uint32 + uint32 = (uint64 ^ uint64 >> 32) & 0xffffffff + # simple unsigned to signed int32 + if uint32 > (1 << 31) - 1: + int32 = uint32 - (1 << 32) + else: + int32 = uint32 + return int32, int64 + +def skip_unless_internalhash(test): + """Skip decorator for tests that depend on SipHash24 or FNV""" + ok = sys.hash_info.algorithm in {"fnv", "siphash24"} + msg = "Requires SipHash24 or FNV" + return test if ok else unittest.skip(msg)(test) + class HashEqualityTestCase(unittest.TestCase): @@ -138,7 +172,7 @@ class HashRandomizationTests: # an object to be tested def get_hash_command(self, repr_): - return 'print(hash(%s))' % repr_ + return 'print(hash(eval(%s.decode("utf-8"))))' % repr_.encode("utf-8") def get_hash(self, repr_, seed=None): env = os.environ.copy() @@ -161,12 +195,67 @@ class HashRandomizationTests: self.assertNotEqual(run1, run2) class StringlikeHashRandomizationTests(HashRandomizationTests): + repr_ = None + repr_long = None + + # 32bit little, 64bit little, 32bit big, 64bit big + known_hashes = { + 'djba33x': [ # only used for small strings + # seed 0, 'abc' + [193485960, 193485960, 193485960, 193485960], + # seed 42, 'abc' + [-678966196, 573763426263223372, -820489388, -4282905804826039665], + ], + 'siphash24': [ + # seed 0, 'abc' + [2025351752, 4596069200710135518, 1433332804, + -3481057401533226760], + # seed 42, 'abc' + [-774632014, -4501618152524544106, 1054608210, + -1493500025205289231], + # seed 42, 'abcdefghijk' + [-1436007334, 4436719588892876975, -1436007334, + 4436719588892876975], + # seed 0, 'äú∑ℇ', PyUCS2 layout depends on endianess + [1386693832, 5749986484189612790, 1776982909, + -5915111450199468540], + # seed 42, 'äú∑ℇ' + [1260387190, -2947981342227738144, 1430287772, + -4296699217652516017], + ], + 'fnv': [ + # seed 0, 'abc' + [-1600925533, 1453079729188098211, -1600925533, + 1453079729188098211], + # seed 42, 'abc' + [-206076799, -4410911502303878509, -1024014457, + -3570150969479994130], + # seed 42, 'abcdefghijk' + [811136751, -5046230049376118746, -77208053 , + -4779029615281019666], + # seed 0, 'äú∑ℇ' + [44402817, 8998297579845987431, -1956240331, + -782697888614047887], + # seed 42, 'äú∑ℇ' + [-283066365, -4576729883824601543, -271871407, None], + ] + } + + def get_expected_hash(self, position, length): + if length < sys.hash_info.cutoff: + algorithm = "djba33x" + else: + algorithm = sys.hash_info.algorithm + if sys.byteorder == 'little': + platform = 1 if IS_64BIT else 0 + else: + assert(sys.byteorder == 'big') + platform = 3 if IS_64BIT else 2 + return self.known_hashes[algorithm][position][platform] + def test_null_hash(self): # PYTHONHASHSEED=0 disables the randomized hash - if IS_64BIT: - known_hash_of_obj = 1453079729188098211 - else: - known_hash_of_obj = -1600925533 + known_hash_of_obj = self.get_expected_hash(0, 3) # Randomization is enabled by default: self.assertNotEqual(self.get_hash(self.repr_), known_hash_of_obj) @@ -174,39 +263,53 @@ class StringlikeHashRandomizationTests(HashRandomizationTests): # It can also be disabled by setting the seed to 0: self.assertEqual(self.get_hash(self.repr_, seed=0), known_hash_of_obj) + @skip_unless_internalhash def test_fixed_hash(self): # test a fixed seed for the randomized hash # Note that all types share the same values: - if IS_64BIT: - if sys.byteorder == 'little': - h = -4410911502303878509 - else: - h = -3570150969479994130 - else: - if sys.byteorder == 'little': - h = -206076799 - else: - h = -1024014457 + h = self.get_expected_hash(1, 3) self.assertEqual(self.get_hash(self.repr_, seed=42), h) + @skip_unless_internalhash + def test_long_fixed_hash(self): + if self.repr_long is None: + return + h = self.get_expected_hash(2, 11) + self.assertEqual(self.get_hash(self.repr_long, seed=42), h) + + class StrHashRandomizationTests(StringlikeHashRandomizationTests, unittest.TestCase): repr_ = repr('abc') + repr_long = repr('abcdefghijk') + repr_ucs2 = repr('äú∑ℇ') + @skip_unless_internalhash def test_empty_string(self): self.assertEqual(hash(""), 0) + @skip_unless_internalhash + def test_ucs2_string(self): + h = self.get_expected_hash(3, 6) + self.assertEqual(self.get_hash(self.repr_ucs2, seed=0), h) + h = self.get_expected_hash(4, 6) + self.assertEqual(self.get_hash(self.repr_ucs2, seed=42), h) + class BytesHashRandomizationTests(StringlikeHashRandomizationTests, unittest.TestCase): repr_ = repr(b'abc') + repr_long = repr(b'abcdefghijk') + @skip_unless_internalhash def test_empty_string(self): self.assertEqual(hash(b""), 0) class MemoryviewHashRandomizationTests(StringlikeHashRandomizationTests, unittest.TestCase): repr_ = "memoryview(b'abc')" + repr_long = "memoryview(b'abcdefghijk')" + @skip_unless_internalhash def test_empty_string(self): self.assertEqual(hash(memoryview(b"")), 0) @@ -224,5 +327,22 @@ class DatetimeTimeTests(DatetimeTests, unittest.TestCase): repr_ = repr(datetime.time(0)) +class HashDistributionTestCase(unittest.TestCase): + + def test_hash_distribution(self): + # check for hash collision + base = "abcdefghabcdefg" + for i in range(1, len(base)): + prefix = base[:i] + s15 = set() + s255 = set() + for c in range(256): + h = hash(prefix + chr(c)) + s15.add(h & 0xf) + s255.add(h & 0xff) + # SipHash24 distribution depends on key, usually > 60% + self.assertGreater(len(s15), 8, prefix) + self.assertGreater(len(s255), 128, prefix) + if __name__ == "__main__": unittest.main() diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py index 0565f39..f0c7148 100644 --- a/Lib/test/test_sys.py +++ b/Lib/test/test_sys.py @@ -8,6 +8,7 @@ import operator import codecs import gc import sysconfig +import platform # count the number of test runs, used to create unique # strings to intern in test_intern() @@ -431,7 +432,7 @@ class SysModuleTest(unittest.TestCase): self.assertEqual(type(sys.int_info.sizeof_digit), int) self.assertIsInstance(sys.hexversion, int) - self.assertEqual(len(sys.hash_info), 5) + self.assertEqual(len(sys.hash_info), 9) self.assertLess(sys.hash_info.modulus, 2**sys.hash_info.width) # sys.hash_info.modulus should be a prime; we do a quick # probable primality test (doesn't exclude the possibility of @@ -446,6 +447,26 @@ class SysModuleTest(unittest.TestCase): self.assertIsInstance(sys.hash_info.inf, int) self.assertIsInstance(sys.hash_info.nan, int) self.assertIsInstance(sys.hash_info.imag, int) + algo = sysconfig.get_config_var("PY_HASH_ALGORITHM") + if sys.hash_info.algorithm in {"fnv", "siphash24"}: + self.assertIn(sys.hash_info.hash_bits, {32, 64}) + self.assertIn(sys.hash_info.seed_bits, {32, 64, 128}) + + if algo == 1: + self.assertEqual(sys.hash_info.algorithm, "siphash24") + elif algo == 2: + self.assertEqual(sys.hash_info.algorithm, "fnv") + else: + processor = platform.processor().lower() + if processor in {"sparc", "mips"}: + self.assertEqual(sys.hash_info.algorithm, "fnv") + else: + self.assertEqual(sys.hash_info.algorithm, "siphash24") + else: + # PY_HASH_EXTERNAL + self.assertEqual(algo, 0) + self.assertGreaterEqual(sys.hash_info.cutoff, 0) + self.assertLess(sys.hash_info.cutoff, 8) self.assertIsInstance(sys.maxsize, int) self.assertIsInstance(sys.maxunicode, int) diff --git a/Makefile.pre.in b/Makefile.pre.in index eae8d75..33f6f86 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -366,6 +366,7 @@ PYTHON_OBJS= \ Python/pyarena.o \ Python/pyctype.o \ Python/pyfpe.o \ + Python/pyhash.o \ Python/pymath.o \ Python/pystate.o \ Python/pythonrun.o \ @@ -868,6 +869,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/pydebug.h \ $(srcdir)/Include/pyerrors.h \ $(srcdir)/Include/pyfpe.h \ + $(srcdir)/Include/pyhash.h \ $(srcdir)/Include/pymath.h \ $(srcdir)/Include/pygetopt.h \ $(srcdir)/Include/pymacro.h \ @@ -802,6 +802,7 @@ Nick Maclaren Don MacMillen Tomasz Maćkowiak Steve Majewski +Marek Majkowski Grzegorz Makarewicz David Malcolm Greg Malcolm @@ -10,6 +10,9 @@ Projected release date: 2013-11-24 Core and Builtins ----------------- +- Issue #19183: Implement PEP 456 'secure and interchangeable hash algorithm'. + Python now uses SipHash24 on all major platforms. + - Issue #12892: The utf-16* and utf-32* encoders no longer allow surrogate code points (U+D800-U+DFFF) to be encoded. The utf-32* decoders no longer decode byte sequences that correspond to surrogate code points. The surrogatepass diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index 156dbf1..e11c153 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -1218,7 +1218,7 @@ newxmlparseobject(char *encoding, char *namespace_separator, PyObject *intern) * has a backport of this feature where we also define XML_HAS_SET_HASH_SALT * to indicate that we can still use it. */ XML_SetHashSalt(self->itself, - (unsigned long)_Py_HashSecret.prefix); + (unsigned long)_Py_HashSecret.expat.hashsalt); #endif XML_SetUserData(self->itself, (void *)self); XML_SetUnknownEncodingHandler(self->itself, diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index efa0192..8217b1e 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -897,7 +897,7 @@ bytes_hash(PyBytesObject *a) { if (a->ob_shash == -1) { /* Can't fail */ - a->ob_shash = _Py_HashBytes((unsigned char *) a->ob_sval, Py_SIZE(a)); + a->ob_shash = _Py_HashBytes(a->ob_sval, Py_SIZE(a)); } return a->ob_shash; } diff --git a/Objects/memoryobject.c b/Objects/memoryobject.c index 1d52d9d..cb644b8 100644 --- a/Objects/memoryobject.c +++ b/Objects/memoryobject.c @@ -2742,7 +2742,7 @@ memory_hash(PyMemoryViewObject *self) } /* Can't fail */ - self->hash = _Py_HashBytes((unsigned char *)mem, view->len); + self->hash = _Py_HashBytes(mem, view->len); if (mem != view->buf) PyMem_Free(mem); diff --git a/Objects/object.c b/Objects/object.c index acc34af..395e28d 100644 --- a/Objects/object.c +++ b/Objects/object.c @@ -731,150 +731,6 @@ PyObject_RichCompareBool(PyObject *v, PyObject *w, int op) return ok; } -/* Set of hash utility functions to help maintaining the invariant that - if a==b then hash(a)==hash(b) - - All the utility functions (_Py_Hash*()) return "-1" to signify an error. -*/ - -/* For numeric types, the hash of a number x is based on the reduction - of x modulo the prime P = 2**_PyHASH_BITS - 1. It's designed so that - hash(x) == hash(y) whenever x and y are numerically equal, even if - x and y have different types. - - A quick summary of the hashing strategy: - - (1) First define the 'reduction of x modulo P' for any rational - number x; this is a standard extension of the usual notion of - reduction modulo P for integers. If x == p/q (written in lowest - terms), the reduction is interpreted as the reduction of p times - the inverse of the reduction of q, all modulo P; if q is exactly - divisible by P then define the reduction to be infinity. So we've - got a well-defined map - - reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }. - - (2) Now for a rational number x, define hash(x) by: - - reduce(x) if x >= 0 - -reduce(-x) if x < 0 - - If the result of the reduction is infinity (this is impossible for - integers, floats and Decimals) then use the predefined hash value - _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead. - _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the - hashes of float and Decimal infinities and nans. - - A selling point for the above strategy is that it makes it possible - to compute hashes of decimal and binary floating-point numbers - efficiently, even if the exponent of the binary or decimal number - is large. The key point is that - - reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS) - - provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a - binary or decimal float is never infinity, since the denominator is a power - of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have, - for nonnegative x, - - reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS - - reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS - - and reduce(10**e) can be computed efficiently by the usual modular - exponentiation algorithm. For reduce(2**e) it's even better: since - P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication - by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits. - - */ - -Py_hash_t -_Py_HashDouble(double v) -{ - int e, sign; - double m; - Py_uhash_t x, y; - - if (!Py_IS_FINITE(v)) { - if (Py_IS_INFINITY(v)) - return v > 0 ? _PyHASH_INF : -_PyHASH_INF; - else - return _PyHASH_NAN; - } - - m = frexp(v, &e); - - sign = 1; - if (m < 0) { - sign = -1; - m = -m; - } - - /* process 28 bits at a time; this should work well both for binary - and hexadecimal floating point. */ - x = 0; - while (m) { - x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28); - m *= 268435456.0; /* 2**28 */ - e -= 28; - y = (Py_uhash_t)m; /* pull out integer part */ - m -= y; - x += y; - if (x >= _PyHASH_MODULUS) - x -= _PyHASH_MODULUS; - } - - /* adjust for the exponent; first reduce it modulo _PyHASH_BITS */ - e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS); - x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e); - - x = x * sign; - if (x == (Py_uhash_t)-1) - x = (Py_uhash_t)-2; - return (Py_hash_t)x; -} - -Py_hash_t -_Py_HashPointer(void *p) -{ - Py_hash_t x; - size_t y = (size_t)p; - /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid - excessive hash collisions for dicts and sets */ - y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4)); - x = (Py_hash_t)y; - if (x == -1) - x = -2; - return x; -} - -Py_hash_t -_Py_HashBytes(unsigned char *p, Py_ssize_t len) -{ - Py_uhash_t x; - Py_ssize_t i; - - /* - We make the hash of the empty string be 0, rather than using - (prefix ^ suffix), since this slightly obfuscates the hash secret - */ -#ifdef Py_DEBUG - assert(_Py_HashSecret_Initialized); -#endif - if (len == 0) { - return 0; - } - x = (Py_uhash_t) _Py_HashSecret.prefix; - x ^= (Py_uhash_t) *p << 7; - for (i = 0; i < len; i++) - x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++; - x ^= (Py_uhash_t) len; - x ^= (Py_uhash_t) _Py_HashSecret.suffix; - if (x == -1) - x = -2; - return x; -} - Py_hash_t PyObject_HashNotImplemented(PyObject *v) { @@ -883,8 +739,6 @@ PyObject_HashNotImplemented(PyObject *v) return -1; } -_Py_HashSecret_t _Py_HashSecret; - Py_hash_t PyObject_Hash(PyObject *v) { diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 880889e..3644db3 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -11386,39 +11386,8 @@ unicode_hash(PyObject *self) _PyUnicode_HASH(self) = 0; return 0; } - - /* The hash function as a macro, gets expanded three times below. */ -#define HASH(P) \ - x ^= (Py_uhash_t) *P << 7; \ - while (--len >= 0) \ - x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \ - - x = (Py_uhash_t) _Py_HashSecret.prefix; - switch (PyUnicode_KIND(self)) { - case PyUnicode_1BYTE_KIND: { - const unsigned char *c = PyUnicode_1BYTE_DATA(self); - HASH(c); - break; - } - case PyUnicode_2BYTE_KIND: { - const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); - HASH(s); - break; - } - default: { - Py_UCS4 *l; - assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && - "Impossible switch case in unicode_hash"); - l = PyUnicode_4BYTE_DATA(self); - HASH(l); - break; - } - } - x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self); - x ^= (Py_uhash_t) _Py_HashSecret.suffix; - - if (x == -1) - x = -2; + x = _Py_HashBytes(PyUnicode_DATA(self), + PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self)); _PyUnicode_HASH(self) = x; return x; } diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 1b6e864..c2e1eb3 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -412,6 +412,7 @@ <ClInclude Include="..\Include\patchlevel.h" /> <ClInclude Include="..\Include\pgen.h" /> <ClInclude Include="..\Include\pgenheaders.h" /> + <ClInclude Include="..\Include\pyhash.h" /> <ClInclude Include="..\Include\py_curses.h" /> <ClInclude Include="..\Include\pyarena.h" /> <ClInclude Include="..\Include\pycapsule.h" /> @@ -616,6 +617,7 @@ <ClCompile Include="..\PC\dl_nt.c" /> <ClCompile Include="..\PC\getpathp.c" /> <ClCompile Include="..\PC\msvcrtmodule.c" /> + <ClCompile Include="..\Python\pyhash.c" /> <ClCompile Include="..\Python\random.c" /> <ClCompile Include="..\Python\_warnings.c" /> <ClCompile Include="..\Python\asdl.c" /> diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index e1a9301..41fe1b2 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -421,6 +421,9 @@ <ClInclude Include="..\Python\ceval_gil.h"> <Filter>Python</Filter> </ClInclude> + <ClInclude Include="..\Include\pyhash.h"> + <Filter>Include</Filter> + </ClInclude> </ItemGroup> <ItemGroup> <ClCompile Include="..\Modules\_bisectmodule.c"> @@ -931,6 +934,9 @@ <ClCompile Include="..\Modules\_stat.c"> <Filter>Modules</Filter> </ClCompile> + <ClCompile Include="..\Python\pyhash.c"> + <Filter>Python</Filter> + </ClCompile> </ItemGroup> <ItemGroup> <ResourceCompile Include="..\PC\python_nt.rc"> diff --git a/Python/pyhash.c b/Python/pyhash.c new file mode 100644 index 0000000..158c631 --- /dev/null +++ b/Python/pyhash.c @@ -0,0 +1,430 @@ +/* Set of hash utility functions to help maintaining the invariant that + if a==b then hash(a)==hash(b) + + All the utility functions (_Py_Hash*()) return "-1" to signify an error. +*/ +#include "Python.h" + +#ifdef __APPLE__ +# include <libkern/OSByteOrder.h> +#elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H) +# include <endian.h> +#elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H) +# include <sys/endian.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +_Py_HashSecret_t _Py_HashSecret; + +#if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL +extern PyHash_FuncDef PyHash_Func; +#else +static PyHash_FuncDef PyHash_Func; +#endif + +/* Count _Py_HashBytes() calls */ +#ifdef Py_HASH_STATS +#define Py_HASH_STATS_MAX 32 +static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0}; +#endif + +/* For numeric types, the hash of a number x is based on the reduction + of x modulo the prime P = 2**_PyHASH_BITS - 1. It's designed so that + hash(x) == hash(y) whenever x and y are numerically equal, even if + x and y have different types. + + A quick summary of the hashing strategy: + + (1) First define the 'reduction of x modulo P' for any rational + number x; this is a standard extension of the usual notion of + reduction modulo P for integers. If x == p/q (written in lowest + terms), the reduction is interpreted as the reduction of p times + the inverse of the reduction of q, all modulo P; if q is exactly + divisible by P then define the reduction to be infinity. So we've + got a well-defined map + + reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }. + + (2) Now for a rational number x, define hash(x) by: + + reduce(x) if x >= 0 + -reduce(-x) if x < 0 + + If the result of the reduction is infinity (this is impossible for + integers, floats and Decimals) then use the predefined hash value + _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead. + _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the + hashes of float and Decimal infinities and nans. + + A selling point for the above strategy is that it makes it possible + to compute hashes of decimal and binary floating-point numbers + efficiently, even if the exponent of the binary or decimal number + is large. The key point is that + + reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS) + + provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a + binary or decimal float is never infinity, since the denominator is a power + of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have, + for nonnegative x, + + reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS + + reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS + + and reduce(10**e) can be computed efficiently by the usual modular + exponentiation algorithm. For reduce(2**e) it's even better: since + P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication + by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits. + + */ + +Py_hash_t +_Py_HashDouble(double v) +{ + int e, sign; + double m; + Py_uhash_t x, y; + + if (!Py_IS_FINITE(v)) { + if (Py_IS_INFINITY(v)) + return v > 0 ? _PyHASH_INF : -_PyHASH_INF; + else + return _PyHASH_NAN; + } + + m = frexp(v, &e); + + sign = 1; + if (m < 0) { + sign = -1; + m = -m; + } + + /* process 28 bits at a time; this should work well both for binary + and hexadecimal floating point. */ + x = 0; + while (m) { + x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28); + m *= 268435456.0; /* 2**28 */ + e -= 28; + y = (Py_uhash_t)m; /* pull out integer part */ + m -= y; + x += y; + if (x >= _PyHASH_MODULUS) + x -= _PyHASH_MODULUS; + } + + /* adjust for the exponent; first reduce it modulo _PyHASH_BITS */ + e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS); + x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e); + + x = x * sign; + if (x == (Py_uhash_t)-1) + x = (Py_uhash_t)-2; + return (Py_hash_t)x; +} + +Py_hash_t +_Py_HashPointer(void *p) +{ + Py_hash_t x; + size_t y = (size_t)p; + /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid + excessive hash collisions for dicts and sets */ + y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4)); + x = (Py_hash_t)y; + if (x == -1) + x = -2; + return x; +} + +Py_hash_t +_Py_HashBytes(const void *src, Py_ssize_t len) +{ + Py_hash_t x; + /* + We make the hash of the empty string be 0, rather than using + (prefix ^ suffix), since this slightly obfuscates the hash secret + */ + if (len == 0) { + return 0; + } + +#ifdef Py_HASH_STATS + hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++; +#endif + +#if Py_HASH_CUTOFF > 0 + if (len < Py_HASH_CUTOFF) { + /* Optimize hashing of very small strings with inline DJBX33A. */ + Py_uhash_t hash; + const unsigned char *p = src; + hash = 5381; /* DJBX33A starts with 5381 */ + + switch(len) { + /* ((hash << 5) + hash) + *p == hash * 33 + *p */ + case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */ + case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */ + case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */ + case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */ + case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */ + case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */ + case 1: hash = ((hash << 5) + hash) + *p++; break; + default: + assert(0); + } + hash ^= len; + hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix; + x = (Py_hash_t)hash; + } + else +#endif /* Py_HASH_CUTOFF */ + x = PyHash_Func.hash(src, len); + + if (x == -1) + return -2; + return x; +} + +void +_PyHash_Fini(void) +{ +#ifdef Py_HASH_STATS + int i; + Py_ssize_t total = 0; + char *fmt = "%2i %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n"; + + fprintf(stderr, "len calls total\n"); + for (i = 1; i <= Py_HASH_STATS_MAX; i++) { + total += hashstats[i]; + fprintf(stderr, fmt, i, hashstats[i], total); + } + total += hashstats[0]; + fprintf(stderr, "> %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n", + hashstats[0], total); +#endif +} + +PyHash_FuncDef * +PyHash_GetFuncDef(void) +{ + return &PyHash_Func; +} + +/* Optimized memcpy() for Windows */ +#ifdef _MSC_VER +# if SIZEOF_PY_UHASH_T == 4 +# define PY_UHASH_CPY(dst, src) do { \ + dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \ + } while(0) +# elif SIZEOF_PY_UHASH_T == 8 +# define PY_UHASH_CPY(dst, src) do { \ + dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \ + dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \ + } while(0) +# else +# error SIZEOF_PY_UHASH_T must be 4 or 8 +# endif /* SIZEOF_PY_UHASH_T */ +#else /* not Windows */ +# define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T) +#endif /* _MSC_VER */ + + +#if Py_HASH_ALGORITHM == Py_HASH_FNV +/* ************************************************************************** + * Modified Fowler-Noll-Vo (FNV) hash function + */ +static Py_hash_t +fnv(const void *src, Py_ssize_t len) +{ + const unsigned char *p = src; + Py_uhash_t x; + Py_ssize_t remainder, blocks; + union { + Py_uhash_t value; + unsigned char bytes[SIZEOF_PY_UHASH_T]; + } block; + +#ifdef Py_DEBUG + assert(_Py_HashSecret_Initialized); +#endif + remainder = len % SIZEOF_PY_UHASH_T; + if (remainder == 0) { + /* Process at least one block byte by byte to reduce hash collisions + * for strings with common prefixes. */ + remainder = SIZEOF_PY_UHASH_T; + } + blocks = (len - remainder) / SIZEOF_PY_UHASH_T; + + x = (Py_uhash_t) _Py_HashSecret.fnv.prefix; + x ^= (Py_uhash_t) *p << 7; + while (blocks--) { + PY_UHASH_CPY(block.bytes, p); + x = (_PyHASH_MULTIPLIER * x) ^ block.value; + p += SIZEOF_PY_UHASH_T; + } + /* add remainder */ + for (; remainder > 0; remainder--) + x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++; + x ^= (Py_uhash_t) len; + x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix; + if (x == -1) { + x = -2; + } + return x; +} + +static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T, + 16 * SIZEOF_PY_HASH_T}; + +#endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */ + + +#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24 +/* ************************************************************************** + <MIT License> + Copyright (c) 2013 Marek Majkowski <marek@popcount.org> + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + </MIT License> + + Original location: + https://github.com/majek/csiphash/ + + Solution inspired by code from: + Samuel Neves (supercop/crypto_auth/siphash24/little) + djb (supercop/crypto_auth/siphash24/little2) + Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c) + + Modified for Python by Christian Heimes: + - C89 / MSVC compatibility + - PY_UINT64_T, PY_UINT32_T and PY_UINT8_T + - _rotl64() on Windows + - letoh64() fallback +*/ + +typedef unsigned char PY_UINT8_T; + +/* byte swap little endian to host endian + * Endian conversion not only ensures that the hash function returns the same + * value on all platforms. It is also required to for a good dispersion of + * the hash values' least significant bits. + */ +#if PY_LITTLE_ENDIAN +# define _le64toh(x) ((PY_UINT64_T)(x)) +#elif defined(__APPLE__) +# define _le64toh(x) OSSwapLittleToHostInt64(x) +#elif defined(HAVE_LETOH64) +# define _le64toh(x) le64toh(x) +#else +# define _le64toh(x) (((PY_UINT64_T)(x) << 56) | \ + (((PY_UINT64_T)(x) << 40) & 0xff000000000000ULL) | \ + (((PY_UINT64_T)(x) << 24) & 0xff0000000000ULL) | \ + (((PY_UINT64_T)(x) << 8) & 0xff00000000ULL) | \ + (((PY_UINT64_T)(x) >> 8) & 0xff000000ULL) | \ + (((PY_UINT64_T)(x) >> 24) & 0xff0000ULL) | \ + (((PY_UINT64_T)(x) >> 40) & 0xff00ULL) | \ + ((PY_UINT64_T)(x) >> 56)) +#endif + + +#ifdef _MSC_VER +# define ROTATE(x, b) _rotl64(x, b) +#else +# define ROTATE(x, b) (PY_UINT64_T)( ((x) << (b)) | ( (x) >> (64 - (b))) ) +#endif + +#define HALF_ROUND(a,b,c,d,s,t) \ + a += b; c += d; \ + b = ROTATE(b, s) ^ a; \ + d = ROTATE(d, t) ^ c; \ + a = ROTATE(a, 32); + +#define DOUBLE_ROUND(v0,v1,v2,v3) \ + HALF_ROUND(v0,v1,v2,v3,13,16); \ + HALF_ROUND(v2,v1,v0,v3,17,21); \ + HALF_ROUND(v0,v1,v2,v3,13,16); \ + HALF_ROUND(v2,v1,v0,v3,17,21); + + +static Py_hash_t +siphash24(const void *src, Py_ssize_t src_sz) { + PY_UINT64_T k0 = _le64toh(_Py_HashSecret.siphash.k0); + PY_UINT64_T k1 = _le64toh(_Py_HashSecret.siphash.k1); + PY_UINT64_T b = (PY_UINT64_T)src_sz << 56; + const PY_UINT64_T *in = (PY_UINT64_T*)src; + + PY_UINT64_T v0 = k0 ^ 0x736f6d6570736575ULL; + PY_UINT64_T v1 = k1 ^ 0x646f72616e646f6dULL; + PY_UINT64_T v2 = k0 ^ 0x6c7967656e657261ULL; + PY_UINT64_T v3 = k1 ^ 0x7465646279746573ULL; + + PY_UINT64_T t; + PY_UINT8_T *pt; + PY_UINT8_T *m; + + while (src_sz >= 8) { + PY_UINT64_T mi = _le64toh(*in); + in += 1; + src_sz -= 8; + v3 ^= mi; + DOUBLE_ROUND(v0,v1,v2,v3); + v0 ^= mi; + } + + t = 0; + pt = (PY_UINT8_T *)&t; + m = (PY_UINT8_T *)in; + switch (src_sz) { + case 7: pt[6] = m[6]; + case 6: pt[5] = m[5]; + case 5: pt[4] = m[4]; + case 4: *((PY_UINT32_T*)&pt[0]) = *((PY_UINT32_T*)&m[0]); break; + case 3: pt[2] = m[2]; + case 2: pt[1] = m[1]; + case 1: pt[0] = m[0]; + } + b |= _le64toh(t); + + v3 ^= b; + DOUBLE_ROUND(v0,v1,v2,v3); + v0 ^= b; + v2 ^= 0xff; + DOUBLE_ROUND(v0,v1,v2,v3); + DOUBLE_ROUND(v0,v1,v2,v3); + + /* modified */ + t = (v0 ^ v1) ^ (v2 ^ v3); +#if SIZEOF_VOID_P == 4 + t ^= (t >> 32); +#endif + return (Py_hash_t)t; +} + +static PyHash_FuncDef PyHash_Func = {siphash24, "siphash24", 64, 128}; + +#endif /* Py_HASH_ALGORITHM == Py_HASH_SIPHASH24 */ + +#ifdef __cplusplus +} +#endif diff --git a/Python/pythonrun.c b/Python/pythonrun.c index e427be3..b5d57df 100644 --- a/Python/pythonrun.c +++ b/Python/pythonrun.c @@ -104,6 +104,7 @@ extern int _PyLong_Init(void); extern void PyLong_Fini(void); extern int _PyFaulthandler_Init(void); extern void _PyFaulthandler_Fini(void); +extern void _PyHash_Fini(void); #ifdef WITH_THREAD extern void _PyGILState_Init(PyInterpreterState *, PyThreadState *); @@ -650,6 +651,8 @@ Py_Finalize(void) #ifdef COUNT_ALLOCS dump_counts(stdout); #endif + /* dump hash stats */ + _PyHash_Fini(); PRINT_TOTAL_REFS(); diff --git a/Python/random.c b/Python/random.c index d9c7e77..de8e9e7 100644 --- a/Python/random.c +++ b/Python/random.c @@ -95,7 +95,7 @@ static int urandom_fd = -1; /* Read size bytes from /dev/urandom into buffer. Call Py_FatalError() on error. */ static void -dev_urandom_noraise(char *buffer, Py_ssize_t size) +dev_urandom_noraise(unsigned char *buffer, Py_ssize_t size) { int fd; Py_ssize_t n; @@ -249,8 +249,9 @@ void _PyRandom_Init(void) { char *env; - void *secret = &_Py_HashSecret; + unsigned char *secret = (unsigned char *)&_Py_HashSecret.uc; Py_ssize_t secret_size = sizeof(_Py_HashSecret_t); + assert(secret_size == sizeof(_Py_HashSecret.uc)); if (_Py_HashSecret_Initialized) return; @@ -278,17 +279,17 @@ _PyRandom_Init(void) memset(secret, 0, secret_size); } else { - lcg_urandom(seed, (unsigned char*)secret, secret_size); + lcg_urandom(seed, secret, secret_size); } } else { #ifdef MS_WINDOWS - (void)win32_urandom((unsigned char *)secret, secret_size, 0); + (void)win32_urandom(secret, secret_size, 0); #else /* #ifdef MS_WINDOWS */ # ifdef __VMS - vms_urandom((unsigned char *)secret, secret_size, 0); + vms_urandom(secret, secret_size, 0); # else - dev_urandom_noraise((char*)secret, secret_size); + dev_urandom_noraise(secret, secret_size); # endif #endif } diff --git a/Python/sysmodule.c b/Python/sysmodule.c index 35a0671..4028a01 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -658,7 +658,7 @@ PyDoc_STRVAR(hash_info_doc, "hash_info\n\ \n\ A struct sequence providing parameters used for computing\n\ -numeric hashes. The attributes are read only."); +hashes. The attributes are read only."); static PyStructSequence_Field hash_info_fields[] = { {"width", "width of the type used for hashing, in bits"}, @@ -667,6 +667,11 @@ static PyStructSequence_Field hash_info_fields[] = { {"inf", "value to be used for hash of a positive infinity"}, {"nan", "value to be used for hash of a nan"}, {"imag", "multiplier used for the imaginary part of a complex number"}, + {"algorithm", "name of the algorithm for hashing of str, bytes and " + "memoryviews"}, + {"hash_bits", "internal output size of hash algorithm"}, + {"seed_bits", "seed size of hash algorithm"}, + {"cutoff", "small string optimization cutoff"}, {NULL, NULL} }; @@ -674,7 +679,7 @@ static PyStructSequence_Desc hash_info_desc = { "sys.hash_info", hash_info_doc, hash_info_fields, - 5, + 9, }; static PyObject * @@ -682,9 +687,11 @@ get_hash_info(void) { PyObject *hash_info; int field = 0; + PyHash_FuncDef *hashfunc; hash_info = PyStructSequence_New(&Hash_InfoType); if (hash_info == NULL) return NULL; + hashfunc = PyHash_GetFuncDef(); PyStructSequence_SET_ITEM(hash_info, field++, PyLong_FromLong(8*sizeof(Py_hash_t))); PyStructSequence_SET_ITEM(hash_info, field++, @@ -695,6 +702,14 @@ get_hash_info(void) PyLong_FromLong(_PyHASH_NAN)); PyStructSequence_SET_ITEM(hash_info, field++, PyLong_FromLong(_PyHASH_IMAG)); + PyStructSequence_SET_ITEM(hash_info, field++, + PyUnicode_FromString(hashfunc->name)); + PyStructSequence_SET_ITEM(hash_info, field++, + PyLong_FromLong(hashfunc->hash_bits)); + PyStructSequence_SET_ITEM(hash_info, field++, + PyLong_FromLong(hashfunc->seed_bits)); + PyStructSequence_SET_ITEM(hash_info, field++, + PyLong_FromLong(Py_HASH_CUTOFF)); if (PyErr_Occurred()) { Py_CLEAR(hash_info); return NULL; @@ -1338,6 +1353,7 @@ exec_prefix -- prefix used to find the machine-specific Python library\n\ executable -- absolute path of the executable binary of the Python interpreter\n\ float_info -- a struct sequence with information about the float implementation.\n\ float_repr_style -- string indicating the style of repr() output for floats\n\ +hash_info -- a struct sequence with information about the hash algorithm.\n\ hexversion -- version information encoded as a single integer\n\ implementation -- Python implementation information.\n\ int_info -- a struct sequence with information about the int implementation.\n\ @@ -792,6 +792,7 @@ with_suffix enable_shared enable_profiling with_pydebug +with_hash_algorithm with_libs with_system_expat with_system_ffi @@ -1465,6 +1466,8 @@ Optional Packages: compiler --with-suffix=.exe set executable suffix --with-pydebug build with Py_DEBUG defined + --with-hash-algorithm=[fnv|siphash24] + select hash algorithm --with-libs='lib1 ...' link against additional libs --with-system-expat build pyexpat module using an installed expat library @@ -6956,7 +6959,8 @@ sys/param.h sys/select.h sys/sendfile.h sys/socket.h sys/statvfs.h \ sys/stat.h sys/syscall.h sys/sys_domain.h sys/termio.h sys/time.h \ sys/times.h sys/types.h sys/uio.h sys/un.h sys/utsname.h sys/wait.h pty.h \ libutil.h sys/resource.h netpacket/packet.h sysexits.h bluetooth.h \ -bluetooth/bluetooth.h linux/tipc.h spawn.h util.h alloca.h +bluetooth/bluetooth.h linux/tipc.h spawn.h util.h alloca.h endian.h \ +sys/endian.h do : as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default" @@ -7330,6 +7334,43 @@ $as_echo "#define HAVE_MAKEDEV 1" >>confdefs.h fi +# byte swapping +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for le64toh" >&5 +$as_echo_n "checking for le64toh... " >&6; } +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#ifdef HAVE_ENDIAN_H +#include <endian.h> +#elif defined(HAVE_SYS_ENDIAN_H) +#include <sys/endian.h> +#endif + +int +main () +{ + + le64toh(1) + ; + return 0; +} + +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_has_le64toh=yes +else + ac_cv_has_le64toh=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_has_le64toh" >&5 +$as_echo "$ac_cv_has_le64toh" >&6; } +if test "$ac_cv_has_le64toh" = "yes"; then + +$as_echo "#define HAVE_HTOLE64 1" >>confdefs.h + +fi + # Enabling LFS on Solaris (2.6 to 9) with gcc 2.95 triggers a bug in # the system headers: If _XOPEN_SOURCE and _LARGEFILE_SOURCE are # defined, but the compiler does not support pragma redefine_extname, @@ -8987,6 +9028,79 @@ rm -f core conftest.err conftest.$ac_objext \ *) ;; esac +# check for systems that require aligned memory access +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking aligned memory access is required" >&5 +$as_echo_n "checking aligned memory access is required... " >&6; } +if test "$cross_compiling" = yes; then : + aligned_required=yes +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int main() +{ + char s[16]; + int i, *p1, *p2; + for (i=0; i < 16; i++) + s[i] = i; + p1 = (int*)(s+1); + p2 = (int*)(s+2); + if (*p1 == *p2) + return 1; + return 0; +} + +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + aligned_required=no +else + aligned_required=yes +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext +fi + + +if test "$aligned_required" = yes ; then + +$as_echo "#define HAVE_ALIGNED_REQUIRED 1" >>confdefs.h + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $aligned_required" >&5 +$as_echo "$aligned_required" >&6; } + + +# str, bytes and memoryview hash algorithm + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for --with-hash-algorithm" >&5 +$as_echo_n "checking for --with-hash-algorithm... " >&6; } + +# Check whether --with-hash_algorithm was given. +if test "${with_hash_algorithm+set}" = set; then : + withval=$with_hash_algorithm; +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $withval" >&5 +$as_echo "$withval" >&6; } +case "$withval" in + siphash24) + $as_echo "#define Py_HASH_ALGORITHM 1" >>confdefs.h + + ;; + fnv) + $as_echo "#define Py_HASH_ALGORITHM 2" >>confdefs.h + + ;; + *) + as_fn_error $? "unknown hash algorithm '$withval'" "$LINENO" 5 + ;; +esac + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: default" >&5 +$as_echo "default" >&6; } +fi + + # Most SVR4 platforms (e.g. Solaris) need -lsocket and -lnsl. { $as_echo "$as_me:${as_lineno-$LINENO}: checking for t_open in -lnsl" >&5 $as_echo_n "checking for t_open in -lnsl... " >&6; } diff --git a/configure.ac b/configure.ac index 369c7e5..5ba7d03 100644 --- a/configure.ac +++ b/configure.ac @@ -1543,7 +1543,8 @@ sys/param.h sys/select.h sys/sendfile.h sys/socket.h sys/statvfs.h \ sys/stat.h sys/syscall.h sys/sys_domain.h sys/termio.h sys/time.h \ sys/times.h sys/types.h sys/uio.h sys/un.h sys/utsname.h sys/wait.h pty.h \ libutil.h sys/resource.h netpacket/packet.h sysexits.h bluetooth.h \ -bluetooth/bluetooth.h linux/tipc.h spawn.h util.h alloca.h) +bluetooth/bluetooth.h linux/tipc.h spawn.h util.h alloca.h endian.h \ +sys/endian.h) CPPFLAGS=$ac_save_cppflags AC_HEADER_DIRENT AC_HEADER_MAJOR @@ -1614,6 +1615,22 @@ if test "$ac_cv_has_makedev" = "yes"; then AC_DEFINE(HAVE_MAKEDEV, 1, [Define this if you have the makedev macro.]) fi +# byte swapping +AC_MSG_CHECKING(for le64toh) +AC_LINK_IFELSE([AC_LANG_PROGRAM([[ +#ifdef HAVE_ENDIAN_H +#include <endian.h> +#elif defined(HAVE_SYS_ENDIAN_H) +#include <sys/endian.h> +#endif +]], [[ + le64toh(1) ]]) +],[ac_cv_has_le64toh=yes],[ac_cv_has_le64toh=no]) +AC_MSG_RESULT($ac_cv_has_le64toh) +if test "$ac_cv_has_le64toh" = "yes"; then + AC_DEFINE(HAVE_HTOLE64, 1, [Define this if you have le64toh()]) +fi + # Enabling LFS on Solaris (2.6 to 9) with gcc 2.95 triggers a bug in # the system headers: If _XOPEN_SOURCE and _LARGEFILE_SOURCE are # defined, but the compiler does not support pragma redefine_extname, @@ -2229,6 +2246,59 @@ case "$ac_sys_system" in *) ;; esac +# check for systems that require aligned memory access +AC_MSG_CHECKING(aligned memory access is required) +AC_TRY_RUN([ +int main() +{ + char s[16]; + int i, *p1, *p2; + for (i=0; i < 16; i++) + s[i] = i; + p1 = (int*)(s+1); + p2 = (int*)(s+2); + if (*p1 == *p2) + return 1; + return 0; +} + ], + [aligned_required=no], + [aligned_required=yes], + [aligned_required=yes]) + +if test "$aligned_required" = yes ; then + AC_DEFINE([HAVE_ALIGNED_REQUIRED], [1], + [Define if aligned memory access is required]) +fi +AC_MSG_RESULT($aligned_required) + + +# str, bytes and memoryview hash algorithm +AH_TEMPLATE(Py_HASH_ALGORITHM, + [Define hash algorithm for str, bytes and memoryview. + SipHash24: 1, FNV: 2, externally defined: 0]) + +AC_MSG_CHECKING(for --with-hash-algorithm) +dnl quadrigraphs "@<:@" and "@:>@" produce "[" and "]" in the output +AC_ARG_WITH(hash_algorithm, + AS_HELP_STRING([--with-hash-algorithm=@<:@fnv|siphash24@:>@], + [select hash algorithm]), +[ +AC_MSG_RESULT($withval) +case "$withval" in + siphash24) + AC_DEFINE(Py_HASH_ALGORITHM, 1) + ;; + fnv) + AC_DEFINE(Py_HASH_ALGORITHM, 2) + ;; + *) + AC_MSG_ERROR([unknown hash algorithm '$withval']) + ;; +esac +], +[AC_MSG_RESULT(default)]) + # Most SVR4 platforms (e.g. Solaris) need -lsocket and -lnsl. AC_CHECK_LIB(nsl, t_open, [LIBS="-lnsl $LIBS"]) # SVR4 AC_CHECK_LIB(socket, socket, [LIBS="-lsocket $LIBS"], [], $LIBS) # SVR4 sockets diff --git a/pyconfig.h.in b/pyconfig.h.in index 13979fc..29e1bfa 100644 --- a/pyconfig.h.in +++ b/pyconfig.h.in @@ -49,6 +49,9 @@ /* Define to 1 if you have the `alarm' function. */ #undef HAVE_ALARM +/* Define if aligned memory access is required */ +#undef HAVE_ALIGNED_REQUIRED + /* Define to 1 if you have the <alloca.h> header file. */ #undef HAVE_ALLOCA_H @@ -199,6 +202,9 @@ /* Defined when any dynamic module loading is enabled. */ #undef HAVE_DYNAMIC_LOADING +/* Define to 1 if you have the <endian.h> header file. */ +#undef HAVE_ENDIAN_H + /* Define if you have the 'epoll' functions. */ #undef HAVE_EPOLL @@ -408,6 +414,9 @@ /* Define if you have the 'hstrerror' function. */ #undef HAVE_HSTRERROR +/* Define this if you have le64toh() */ +#undef HAVE_HTOLE64 + /* Define to 1 if you have the `hypot' function. */ #undef HAVE_HYPOT @@ -927,6 +936,9 @@ */ #undef HAVE_SYS_DIR_H +/* Define to 1 if you have the <sys/endian.h> header file. */ +#undef HAVE_SYS_ENDIAN_H + /* Define to 1 if you have the <sys/epoll.h> header file. */ #undef HAVE_SYS_EPOLL_H @@ -1193,6 +1205,10 @@ /* Defined if Python is built as a shared library. */ #undef Py_ENABLE_SHARED +/* Define hash algorithm for str, bytes and memoryview. SipHash24: 1, FNV: 2, + externally defined: 0 */ +#undef Py_HASH_ALGORITHM + /* assume C89 semantics that RETSIGTYPE is always void */ #undef RETSIGTYPE |