From 0ded5b54bb499865fb4ab8c0ac3d0977df9a334d Mon Sep 17 00:00:00 2001 From: Christian Heimes Date: Mon, 10 Dec 2007 15:50:56 +0000 Subject: Fixed issue #1564: The set implementation should special-case PyUnicode instead of PyString I moved the unicode_eq to stringlib/eq.h to keep the function static and possible inline for setobject.c and dictobject.h. I also removed the unused _PyString_Eq method. If it's required in the future it can be added to eq.h --- Include/stringobject.h | 1 - Lib/test/test_set.py | 42 ++++++++++++++++++++++++++++++++++++++++++ Misc/NEWS | 3 +++ Objects/dictobject.c | 20 +------------------- Objects/setobject.c | 33 ++++++++++++++++++--------------- Objects/stringlib/eq.h | 21 +++++++++++++++++++++ Objects/stringobject.c | 10 ---------- 7 files changed, 85 insertions(+), 45 deletions(-) create mode 100644 Objects/stringlib/eq.h diff --git a/Include/stringobject.h b/Include/stringobject.h index 184bcdd..6a63576 100644 --- a/Include/stringobject.h +++ b/Include/stringobject.h @@ -58,7 +58,6 @@ PyAPI_FUNC(PyObject *) PyString_Repr(PyObject *, int); PyAPI_FUNC(void) PyString_Concat(PyObject **, PyObject *); PyAPI_FUNC(void) PyString_ConcatAndDel(PyObject **, PyObject *); PyAPI_FUNC(int) _PyString_Resize(PyObject **, Py_ssize_t); -PyAPI_FUNC(int) _PyString_Eq(PyObject *, PyObject*); PyAPI_FUNC(PyObject *) PyString_Format(PyObject *, PyObject *); PyAPI_FUNC(PyObject *) _PyString_FormatLong(PyObject*, int, int, int, char**, int*); diff --git a/Lib/test/test_set.py b/Lib/test/test_set.py index ba5801d..e761f19 100644 --- a/Lib/test/test_set.py +++ b/Lib/test/test_set.py @@ -7,6 +7,7 @@ import pickle import os from random import randrange, shuffle import sys +import warnings class PassThru(Exception): pass @@ -817,6 +818,44 @@ class TestBasicOpsTriple(TestBasicOps): self.length = 3 self.repr = None +#------------------------------------------------------------------------------ + +class TestBasicOpsString(TestBasicOps): + def setUp(self): + self.case = "string set" + self.values = ["a", "b", "c"] + self.set = set(self.values) + self.dup = set(self.values) + self.length = 3 + self.repr = "{'a', 'c', 'b'}" + +#------------------------------------------------------------------------------ + +class TestBasicOpsBytes(TestBasicOps): + def setUp(self): + self.case = "string set" + self.values = [b"a", b"b", b"c"] + self.set = set(self.values) + self.dup = set(self.values) + self.length = 3 + self.repr = "{b'a', b'c', b'b'}" + +#------------------------------------------------------------------------------ + +class TestBasicOpsMixedStringBytes(TestBasicOps): + def setUp(self): + self.warning_filters = warnings.filters[:] + warnings.simplefilter('ignore', BytesWarning) + self.case = "string and bytes set" + self.values = ["a", "b", b"a", b"b"] + self.set = set(self.values) + self.dup = set(self.values) + self.length = 4 + self.repr = "{'a', b'a', 'b', b'b'}" + + def tearDown(self): + warnings.filters = self.warning_filters + #============================================================================== def baditer(): @@ -1581,6 +1620,9 @@ def test_main(verbose=None): TestBasicOpsSingleton, TestBasicOpsTuple, TestBasicOpsTriple, + TestBasicOpsString, + TestBasicOpsBytes, + TestBasicOpsMixedStringBytes, TestBinaryOps, TestUpdateOps, TestMutate, diff --git a/Misc/NEWS b/Misc/NEWS index f5ab868..bd8a3e7 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -14,6 +14,9 @@ Core and Builtins - Issue #1573: Improper use of the keyword-only syntax makes the parser crash +- Issue #1564: The set implementation should special-case PyUnicode instead + of PyString + Extension Modules ----------------- diff --git a/Objects/dictobject.c b/Objects/dictobject.c index ae400b6..05252d7 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -8,6 +8,7 @@ */ #include "Python.h" +#include "stringlib/eq.h" /* Set a key error with the specified argument, wrapping it in a @@ -327,25 +328,6 @@ lookdict(PyDictObject *mp, PyObject *key, register long hash) return 0; } -/* Return 1 if two unicode objects are equal, 0 if not. */ -static int -unicode_eq(PyObject *aa, PyObject *bb) -{ - PyUnicodeObject *a = (PyUnicodeObject *)aa; - PyUnicodeObject *b = (PyUnicodeObject *)bb; - - if (a->length != b->length) - return 0; - if (a->length == 0) - return 1; - if (a->str[0] != b->str[0]) - return 0; - if (a->length == 1) - return 1; - return memcmp(a->str, b->str, a->length * sizeof(Py_UNICODE)) == 0; -} - - /* * Hacked up version of lookdict which can assume keys are always * unicodes; this assumption allows testing for errors during diff --git a/Objects/setobject.c b/Objects/setobject.c index f22268e..cd63968 100644 --- a/Objects/setobject.c +++ b/Objects/setobject.c @@ -9,6 +9,7 @@ #include "Python.h" #include "structmember.h" +#include "stringlib/eq.h" /* Set a key error with the specified argument, wrapping it in a * tuple automatically so that tuple keys are not unpacked as the @@ -55,6 +56,7 @@ _PySet_Dummy(void) static PySetObject *free_sets[MAXFREESETS]; static int num_free_sets = 0; + /* The basic lookup function used by all operations. This is based on Algorithm D from Knuth Vol. 3, Sec. 6.4. @@ -144,12 +146,12 @@ set_lookkey(PySetObject *so, PyObject *key, register long hash) } /* - * Hacked up version of set_lookkey which can assume keys are always strings; - * This means we can always use _PyString_Eq directly and not have to check to + * Hacked up version of set_lookkey which can assume keys are always unicode; + * This means we can always use unicode_eq directly and not have to check to * see if the comparison altered the table. */ static setentry * -set_lookkey_string(PySetObject *so, PyObject *key, register long hash) +set_lookkey_unicode(PySetObject *so, PyObject *key, register long hash) { register Py_ssize_t i; register size_t perturb; @@ -158,11 +160,11 @@ set_lookkey_string(PySetObject *so, PyObject *key, register long hash) setentry *table = so->table; register setentry *entry; - /* Make sure this function doesn't have to handle non-string keys, + /* Make sure this function doesn't have to handle non-unicode keys, including subclasses of str; e.g., one reason to subclass strings is to override __eq__, and for speed we don't cater to that here. */ - if (!PyString_CheckExact(key)) { + if (!PyUnicode_CheckExact(key)) { so->lookup = set_lookkey; return set_lookkey(so, key, hash); } @@ -173,7 +175,7 @@ set_lookkey_string(PySetObject *so, PyObject *key, register long hash) if (entry->key == dummy) freeslot = entry; else { - if (entry->hash == hash && _PyString_Eq(entry->key, key)) + if (entry->hash == hash && unicode_eq(entry->key, key)) return entry; freeslot = NULL; } @@ -188,7 +190,7 @@ set_lookkey_string(PySetObject *so, PyObject *key, register long hash) if (entry->key == key || (entry->hash == hash && entry->key != dummy - && _PyString_Eq(entry->key, key))) + && unicode_eq(entry->key, key))) return entry; if (entry->key == dummy && freeslot == NULL) freeslot = entry; @@ -375,8 +377,8 @@ set_add_key(register PySetObject *so, PyObject *key) register long hash; register Py_ssize_t n_used; - if (!PyString_CheckExact(key) || - (hash = ((PyStringObject *) key)->ob_shash) == -1) { + if (!PyUnicode_CheckExact(key) || + (hash = ((PyUnicodeObject *) key)->hash) == -1) { hash = PyObject_Hash(key); if (hash == -1) return -1; @@ -422,8 +424,9 @@ set_discard_key(PySetObject *so, PyObject *key) PyObject *old_key; assert (PyAnySet_Check(so)); - if (!PyString_CheckExact(key) || - (hash = ((PyStringObject *) key)->ob_shash) == -1) { + + if (!PyUnicode_CheckExact(key) || + (hash = ((PyUnicodeObject *) key)->hash) == -1) { hash = PyObject_Hash(key); if (hash == -1) return -1; @@ -668,8 +671,8 @@ set_contains_key(PySetObject *so, PyObject *key) long hash; setentry *entry; - if (!PyString_CheckExact(key) || - (hash = ((PyStringObject *) key)->ob_shash) == -1) { + if (!PyUnicode_CheckExact(key) || + (hash = ((PyUnicodeObject *) key)->hash) == -1) { hash = PyObject_Hash(key); if (hash == -1) return -1; @@ -989,7 +992,7 @@ make_new_set(PyTypeObject *type, PyObject *iterable) INIT_NONZERO_SET_SLOTS(so); } - so->lookup = set_lookkey_string; + so->lookup = set_lookkey_unicode; so->weakreflist = NULL; if (iterable != NULL) { @@ -1352,7 +1355,7 @@ set_isdisjoint(PySetObject *so, PyObject *other) while ((key = PyIter_Next(it)) != NULL) { int rv; setentry entry; - long hash = PyObject_Hash(key); + long hash = PyObject_Hash(key);; if (hash == -1) { Py_DECREF(key); diff --git a/Objects/stringlib/eq.h b/Objects/stringlib/eq.h new file mode 100644 index 0000000..4e989dc --- /dev/null +++ b/Objects/stringlib/eq.h @@ -0,0 +1,21 @@ +/* Fast unicode equal function optimized for dictobject.c and setobject.c */ + +/* Return 1 if two unicode objects are equal, 0 if not. + * unicode_eq() is called when the hash of two unicode objects is equal. + */ +Py_LOCAL_INLINE(int) +unicode_eq(PyObject *aa, PyObject *bb) +{ + register PyUnicodeObject *a = (PyUnicodeObject *)aa; + register PyUnicodeObject *b = (PyUnicodeObject *)bb; + + if (a->length != b->length) + return 0; + if (a->length == 0) + return 1; + if (a->str[0] != b->str[0]) + return 0; + if (a->length == 1) + return 1; + return memcmp(a->str, b->str, a->length * sizeof(Py_UNICODE)) == 0; +} diff --git a/Objects/stringobject.c b/Objects/stringobject.c index 855f6cd..619804b 100644 --- a/Objects/stringobject.c +++ b/Objects/stringobject.c @@ -877,16 +877,6 @@ string_richcompare(PyStringObject *a, PyStringObject *b, int op) return result; } -int -_PyString_Eq(PyObject *o1, PyObject *o2) -{ - PyStringObject *a = (PyStringObject*) o1; - PyStringObject *b = (PyStringObject*) o2; - return Py_Size(a) == Py_Size(b) - && *a->ob_sval == *b->ob_sval - && memcmp(a->ob_sval, b->ob_sval, Py_Size(a)) == 0; -} - static long string_hash(PyStringObject *a) { -- cgit v0.12