diff options
-rw-r--r-- | Doc/library/stdtypes.rst | 29 | ||||
-rw-r--r-- | Lib/test/test_unicode.py | 37 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 150 |
3 files changed, 155 insertions, 61 deletions
diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index 5c69ed6..4f09205 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -800,6 +800,21 @@ functions based on regular expressions. 'example.com' +.. method:: str.maketrans(x[, y[, z]]) + + This static method returns a translation table usable for :meth:`str.translate`. + + If there is only one argument, it must be a dictionary mapping Unicode + ordinals (integers) or characters (strings of length 1) to Unicode ordinals, + strings (of arbitrary lengths) or None. Character keys will then be + converted to ordinals. + + If there are two arguments, they must be strings of equal length, and in the + resulting dictionary, each character in x will be mapped to the character at + the same position in y. If there is a third argument, it must be a string, + whose characters will be mapped to None in the result. + + .. method:: str.partition(sep) Split the string at the first occurrence of *sep*, and return a 3-tuple @@ -934,15 +949,17 @@ functions based on regular expressions. .. method:: str.translate(map) Return a copy of the *s* where all characters have been mapped through the - *map* which must be a dictionary of characters (strings of length 1) or - Unicode ordinals (integers) to Unicode ordinals, strings or ``None``. - Unmapped characters are left untouched. Characters mapped to ``None`` are - deleted. + *map* which must be a dictionary of Unicode ordinals (integers) to Unicode + ordinals, strings or ``None``. Unmapped characters are left untouched. + Characters mapped to ``None`` are deleted. + + A *map* for :meth:`str.translate` is usually best created by + :meth:`str.maketrans`. .. note:: - A more flexible approach is to create a custom character mapping codec - using the :mod:`codecs` module (see :mod:`encodings.cp1251` for an + An even more flexible approach is to create a custom character mapping + codec using the :mod:`codecs` module (see :mod:`encodings.cp1251` for an example). 
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 7475535..fe4eb85 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -166,18 +166,37 @@ class UnicodeTest( self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, 8) self.assertRaises(ValueError, 'abcdefghi'.rindex, 'ghi', 0, -1) - def test_translate(self): - self.checkequalnofix('bbbc', 'abababc', 'translate', {ord('a'):None}) - self.checkequalnofix('iiic', 'abababc', 'translate', {ord('a'):None, ord('b'):ord('i')}) - self.checkequalnofix('iiix', 'abababc', 'translate', {ord('a'):None, ord('b'):ord('i'), ord('c'):'x'}) - self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', {'a':None, 'b':'<i>'}) - self.checkequalnofix('c', 'abababc', 'translate', {ord('a'):None, ord('b'):''}) - self.checkequalnofix('xyyx', 'xzx', 'translate', {ord('z'):'yy'}) + def test_maketrans_translate(self): + # these work with plain translate() + self.checkequalnofix('bbbc', 'abababc', 'translate', + {ord('a'): None}) + self.checkequalnofix('iiic', 'abababc', 'translate', + {ord('a'): None, ord('b'): ord('i')}) + self.checkequalnofix('iiix', 'abababc', 'translate', + {ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'}) + self.checkequalnofix('c', 'abababc', 'translate', + {ord('a'): None, ord('b'): ''}) + self.checkequalnofix('xyyx', 'xzx', 'translate', + {ord('z'): 'yy'}) + # this needs maketrans() + self.checkequalnofix('abababc', 'abababc', 'translate', + {'b': '<i>'}) + tbl = self.type2test.maketrans({'a': None, 'b': '<i>'}) + self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl) + # test alternative way of calling maketrans() + tbl = self.type2test.maketrans('abc', 'xyz', 'd') + self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl) + + self.assertRaises(TypeError, self.type2test.maketrans) + self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg') + self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def') + self.assertRaises(TypeError, 
self.type2test.maketrans, 'abc', 2) + self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2) + self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2}) + self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2}) self.assertRaises(TypeError, 'hello'.translate) self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz') - self.assertRaises(ValueError, 'abababc'.translate, {'xy':2}) - self.assertRaises(TypeError, 'abababc'.translate, {(1,):2}) def test_split(self): string_tests.CommonTest.test_split(self) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 02b0c7a..205576f 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -7793,68 +7793,124 @@ unicode_swapcase(PyUnicodeObject *self) return fixup(self, fixswapcase); } -PyDoc_STRVAR(translate__doc__, -"S.translate(table) -> unicode\n\ +PyDoc_STRVAR(maketrans__doc__, +"str.maketrans(x[, y[, z]]) -> dict (static method)\n\ \n\ -Return a copy of the string S, where all characters have been mapped\n\ -through the given translation table, which must be a mapping of\n\ -Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\ -Unmapped characters are left untouched. Characters mapped to None\n\ -are deleted."); +Return a translation table usable for str.translate().\n\ +If there is only one argument, it must be a dictionary mapping Unicode\n\ +ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ +Character keys will then be converted to ordinals.\n\ +If there are two arguments, they must be strings of equal length, and\n\ +in the resulting dictionary, each character in x will be mapped to the\n\ +character at the same position in y. 
If there is a third argument, it\n\ +must be a string, whose characters will be mapped to None in the result."); static PyObject* -unicode_translate(PyUnicodeObject *self, PyObject *table) +unicode_maketrans(PyUnicodeObject *null, PyObject *args) { - PyObject *newtable = NULL; + PyObject *x, *y = NULL, *z = NULL; + PyObject *new = NULL, *key, *value; Py_ssize_t i = 0; - PyObject *key, *value, *result; - - if (!PyDict_Check(table)) { - PyErr_SetString(PyExc_TypeError, "translate argument must be a dict"); + int res; + + if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) return NULL; - } - /* fixup the table -- allow size-1 string keys instead of only int keys */ - newtable = PyDict_Copy(table); - if (!newtable) return NULL; - while (PyDict_Next(table, &i, &key, &value)) { - if (PyUnicode_Check(key)) { - /* convert string keys to integer keys */ - PyObject *newkey; - int res; - if (PyUnicode_GET_SIZE(key) != 1) { - PyErr_SetString(PyExc_ValueError, "string items in translate " - "table must be 1 element long"); - goto err; - } - newkey = PyInt_FromLong(PyUnicode_AS_UNICODE(key)[0]); - if (!newkey) + new = PyDict_New(); + if (!new) + return NULL; + if (y != NULL) { + /* x must be a string too, of equal length */ + Py_ssize_t ylen = PyUnicode_GET_SIZE(y); + if (!PyUnicode_Check(x)) { + PyErr_SetString(PyExc_TypeError, "first maketrans argument must " + "be a string if there is a second argument"); + goto err; + } + if (PyUnicode_GET_SIZE(x) != ylen) { + PyErr_SetString(PyExc_ValueError, "the first two maketrans " + "arguments must have equal length"); + goto err; + } + /* create entries for translating chars in x to those in y */ + for (i = 0; i < PyUnicode_GET_SIZE(x); i++) { + key = PyInt_FromLong(PyUnicode_AS_UNICODE(x)[i]); + value = PyInt_FromLong(PyUnicode_AS_UNICODE(y)[i]); + if (!key || !value) goto err; - res = PyDict_SetItem(newtable, newkey, value); - Py_DECREF(newkey); + res = PyDict_SetItem(new, key, value); + Py_DECREF(key); + Py_DECREF(value); if 
(res < 0) goto err; - } else if (PyInt_Check(key)) { - /* just keep integer keys */ - if (PyDict_SetItem(newtable, key, value) < 0) - goto err; - } else { - PyErr_SetString(PyExc_TypeError, "items in translate table must be " - "strings or integers"); + } + /* create entries for deleting chars in z */ + if (z != NULL) { + for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { + key = PyInt_FromLong(PyUnicode_AS_UNICODE(z)[i]); + if (!key) + goto err; + res = PyDict_SetItem(new, key, Py_None); + Py_DECREF(key); + if (res < 0) + goto err; + } + } + } else { + /* x must be a dict */ + if (!PyDict_Check(x)) { + PyErr_SetString(PyExc_TypeError, "if you give only one argument " + "to maketrans it must be a dict"); goto err; } + /* copy entries into the new dict, converting string keys to int keys */ + while (PyDict_Next(x, &i, &key, &value)) { + if (PyUnicode_Check(key)) { + /* convert string keys to integer keys */ + PyObject *newkey; + if (PyUnicode_GET_SIZE(key) != 1) { + PyErr_SetString(PyExc_ValueError, "string keys in translate " + "table must be of length 1"); + goto err; + } + newkey = PyInt_FromLong(PyUnicode_AS_UNICODE(key)[0]); + if (!newkey) + goto err; + res = PyDict_SetItem(new, newkey, value); + Py_DECREF(newkey); + if (res < 0) + goto err; + } else if (PyInt_Check(key)) { + /* just keep integer keys */ + if (PyDict_SetItem(new, key, value) < 0) + goto err; + } else { + PyErr_SetString(PyExc_TypeError, "keys in translate table must " + "be strings or integers"); + goto err; + } + } } - - result = PyUnicode_TranslateCharmap(self->str, - self->length, - newtable, - "ignore"); - Py_DECREF(newtable); - return result; + return new; err: - Py_DECREF(newtable); + Py_DECREF(new); return NULL; } +PyDoc_STRVAR(translate__doc__, +"S.translate(table) -> unicode\n\ +\n\ +Return a copy of the string S, where all characters have been mapped\n\ +through the given translation table, which must be a mapping of\n\ +Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\ 
+Unmapped characters are left untouched. Characters mapped to None\n\ +are deleted."); + +static PyObject* +unicode_translate(PyUnicodeObject *self, PyObject *table) +{ + return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore"); +} + PyDoc_STRVAR(upper__doc__, "S.upper() -> unicode\n\ \n\ @@ -8076,6 +8132,8 @@ static PyMethodDef unicode_methods[] = { {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__}, {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS}, {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS}, + {"maketrans", (PyCFunction) unicode_maketrans, + METH_VARARGS | METH_STATIC, maketrans__doc__}, #if 0 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, #endif |