diff options
author | Guido van Rossum <guido@python.org> | 2007-09-10 16:53:45 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 2007-09-10 16:53:45 (GMT) |
commit | 8f95067915fc1cd74c8ef31fccc84796cd45fff3 (patch) | |
tree | 3a2ab4404185e1d15badcaa9e8751d4f19380b37 | |
parent | 954c31bcc737c99b597870b42611228ed05ac699 (diff) | |
download | cpython-8f95067915fc1cd74c8ef31fccc84796cd45fff3.zip cpython-8f95067915fc1cd74c8ef31fccc84796cd45fff3.tar.gz cpython-8f95067915fc1cd74c8ef31fccc84796cd45fff3.tar.bz2 |
Bug #1125 (my code).
Support bytes.split() and bytes.strip() with no argument -- these split/strip on ASCII
whitespace (tab, space, CR, LF, FF, VT), like their str counterparts.
The same applies to rsplit(), lstrip() and rstrip().
Also change all these functions to accept arbitrary arguments that support the
buffer API.
Includes unit tests.
-rw-r--r-- | Lib/test/test_bytes.py | 54 | ||||
-rw-r--r-- | Objects/bytesobject.c | 238 |
2 files changed, 233 insertions, 59 deletions
diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py index 1c161bb..cfcd282 100644 --- a/Lib/test/test_bytes.py +++ b/Lib/test/test_bytes.py @@ -617,16 +617,46 @@ class BytesTest(unittest.TestCase): self.assertEqual(b.split(b'i'), [b'm', b'ss', b'ss', b'pp', b'']) self.assertEqual(b.split(b'ss'), [b'mi', b'i', b'ippi']) self.assertEqual(b.split(b'w'), [b]) - # require an arg (no magic whitespace split) - self.assertRaises(TypeError, b.split) + + def test_split_whitespace(self): + for b in (b' arf barf ', b'arf\tbarf', b'arf\nbarf', b'arf\rbarf', + b'arf\fbarf', b'arf\vbarf'): + self.assertEqual(b.split(), [b'arf', b'barf']) + self.assertEqual(b.split(None), [b'arf', b'barf']) + self.assertEqual(b.split(None, 2), [b'arf', b'barf']) + self.assertEqual(b' a bb c '.split(None, 0), [b'a bb c ']) + self.assertEqual(b' a bb c '.split(None, 1), [b'a', b'bb c ']) + self.assertEqual(b' a bb c '.split(None, 2), [b'a', b'bb', b'c ']) + self.assertEqual(b' a bb c '.split(None, 3), [b'a', b'bb', b'c']) + + def test_split_buffer(self): + self.assertEqual(b'a b'.split(buffer(b' ')), [b'a', b'b']) + + def test_split_string_error(self): + self.assertRaises(TypeError, b'a b'.split, ' ') def test_rsplit(self): b = b'mississippi' self.assertEqual(b.rsplit(b'i'), [b'm', b'ss', b'ss', b'pp', b'']) self.assertEqual(b.rsplit(b'ss'), [b'mi', b'i', b'ippi']) self.assertEqual(b.rsplit(b'w'), [b]) - # require an arg (no magic whitespace split) - self.assertRaises(TypeError, b.rsplit) + + def test_rsplit_whitespace(self): + for b in (b' arf barf ', b'arf\tbarf', b'arf\nbarf', b'arf\rbarf', + b'arf\fbarf', b'arf\vbarf'): + self.assertEqual(b.rsplit(), [b'arf', b'barf']) + self.assertEqual(b.rsplit(None), [b'arf', b'barf']) + self.assertEqual(b.rsplit(None, 2), [b'arf', b'barf']) + self.assertEqual(b' a bb c '.rsplit(None, 0), [b' a bb c']) + self.assertEqual(b' a bb c '.rsplit(None, 1), [b' a bb', b'c']) + self.assertEqual(b' a bb c '.rsplit(None,2), [b' a', b'bb', b'c']) + 
self.assertEqual(b' a bb c '.rsplit(None, 3), [b'a', b'bb', b'c']) + + def test_rplit_buffer(self): + self.assertEqual(b'a b'.rsplit(buffer(b' ')), [b'a', b'b']) + + def test_rplit_string_error(self): + self.assertRaises(TypeError, b'a b'.rsplit, ' ') def test_partition(self): b = b'mississippi' @@ -670,6 +700,22 @@ class BytesTest(unittest.TestCase): self.assertEqual(b.rstrip(b'im'), b'mississipp') self.assertEqual(b.rstrip(b'pim'), b'mississ') + def test_strip_whitespace(self): + b = b' \t\n\r\f\vabc \t\n\r\f\v' + self.assertEqual(b.strip(), b'abc') + self.assertEqual(b.lstrip(), b'abc \t\n\r\f\v') + self.assertEqual(b.rstrip(), b' \t\n\r\f\vabc') + + def test_strip_buffer(self): + self.assertEqual(b'abc'.strip(buffer(b'ac')), b'b') + self.assertEqual(b'abc'.lstrip(buffer(b'ac')), b'bc') + self.assertEqual(b'abc'.rstrip(buffer(b'ac')), b'ab') + + def test_strip_string_error(self): + self.assertRaises(TypeError, b'abc'.strip, 'b') + self.assertRaises(TypeError, b'abc'.lstrip, 'b') + self.assertRaises(TypeError, b'abc'.rstrip, 'b') + def test_ord(self): b = b'\0A\x7f\x80\xff' self.assertEqual([ord(b[i:i+1]) for i in range(len(b))], diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 930b761..1486c5e 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -2104,7 +2104,7 @@ bytes_replace(PyBytesObject *self, PyObject *args) Py_LOCAL_INLINE(PyObject *) split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount) { - register Py_ssize_t i, j, count=0; + register Py_ssize_t i, j, count = 0; PyObject *str; PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); @@ -2113,7 +2113,7 @@ split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount) i = j = 0; while ((j < len) && (maxcount-- > 0)) { - for(; j<len; j++) { + for(; j < len; j++) { /* I found that using memchr makes no difference */ if (s[j] == ch) { SPLIT_ADD(s, i, j); @@ -2133,46 +2133,91 @@ split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount) return 
NULL; } +#define ISSPACE(c) (isspace(Py_CHARMASK(c)) && ((c) & 0x80) == 0) + +Py_LOCAL_INLINE(PyObject *) +split_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount) +{ + register Py_ssize_t i, j, count = 0; + PyObject *str; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + + if (list == NULL) + return NULL; + + for (i = j = 0; i < len; ) { + /* find a token */ + while (i < len && ISSPACE(s[i])) + i++; + j = i; + while (i < len && !ISSPACE(s[i])) + i++; + if (j < i) { + if (maxcount-- <= 0) + break; + SPLIT_ADD(s, j, i); + while (i < len && ISSPACE(s[i])) + i++; + j = i; + } + } + if (j < len) { + SPLIT_ADD(s, j, len); + } + FIX_PREALLOC_SIZE(list); + return list; + + onError: + Py_DECREF(list); + return NULL; +} + PyDoc_STRVAR(split__doc__, -"B.split(sep [,maxsplit]) -> list of bytes\n\ +"B.split([sep [, maxsplit]]) -> list of bytes\n\ \n\ -Return a list of the bytes in the string B, using sep as the\n\ -delimiter. If maxsplit is given, at most maxsplit\n\ -splits are done."); +Return a list of the bytes in the string B, using sep as the delimiter.\n\ +If sep is not given, B is split on ASCII whitespace charcters\n\ +(space, tab, return, newline, formfeed, vertical tab).\n\ +If maxsplit is given, at most maxsplit splits are done."); static PyObject * bytes_split(PyBytesObject *self, PyObject *args) { Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j; - Py_ssize_t maxsplit = -1, count=0; + Py_ssize_t maxsplit = -1, count = 0; const char *s = PyBytes_AS_STRING(self), *sub; - PyObject *list, *str, *subobj; + PyObject *list, *str, *subobj = Py_None; + PyBuffer vsub; #ifdef USE_FAST Py_ssize_t pos; #endif - if (!PyArg_ParseTuple(args, "O|n:split", &subobj, &maxsplit)) + if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit)) return NULL; if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; - if (PyBytes_Check(subobj)) { - sub = PyBytes_AS_STRING(subobj); - n = PyBytes_GET_SIZE(subobj); - } - /* XXX -> use the modern buffer interface */ - else if 
(PyObject_AsCharBuffer(subobj, &sub, &n)) + + if (subobj == Py_None) + return split_whitespace(s, len, maxsplit); + + if (_getbuffer(subobj, &vsub) < 0) return NULL; + sub = vsub.buf; + n = vsub.len; if (n == 0) { PyErr_SetString(PyExc_ValueError, "empty separator"); + PyObject_ReleaseBuffer(subobj, &vsub); return NULL; } - else if (n == 1) + if (n == 1) return split_char(s, len, sub[0], maxsplit); list = PyList_New(PREALLOC_SIZE(maxsplit)); - if (list == NULL) + if (list == NULL) { + PyObject_ReleaseBuffer(subobj, &vsub); return NULL; + } #ifdef USE_FAST i = j = 0; @@ -2198,10 +2243,12 @@ bytes_split(PyBytesObject *self, PyObject *args) #endif SPLIT_ADD(s, i, len); FIX_PREALLOC_SIZE(list); + PyObject_ReleaseBuffer(subobj, &vsub); return list; onError: Py_DECREF(list); + PyObject_ReleaseBuffer(subobj, &vsub); return NULL; } @@ -2293,44 +2340,90 @@ rsplit_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount) return NULL; } +Py_LOCAL_INLINE(PyObject *) +rsplit_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount) +{ + register Py_ssize_t i, j, count = 0; + PyObject *str; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + + if (list == NULL) + return NULL; + + for (i = j = len - 1; i >= 0; ) { + /* find a token */ + while (i >= 0 && Py_UNICODE_ISSPACE(s[i])) + i--; + j = i; + while (i >= 0 && !Py_UNICODE_ISSPACE(s[i])) + i--; + if (j > i) { + if (maxcount-- <= 0) + break; + SPLIT_ADD(s, i + 1, j + 1); + while (i >= 0 && Py_UNICODE_ISSPACE(s[i])) + i--; + j = i; + } + } + if (j >= 0) { + SPLIT_ADD(s, 0, j + 1); + } + FIX_PREALLOC_SIZE(list); + if (PyList_Reverse(list) < 0) + goto onError; + + return list; + + onError: + Py_DECREF(list); + return NULL; +} + PyDoc_STRVAR(rsplit__doc__, "B.rsplit(sep [,maxsplit]) -> list of bytes\n\ \n\ -Return a list of the sections in the byte B, using sep as the\n\ -delimiter, starting at the end of the bytes and working\n\ -to the front. 
If maxsplit is given, at most maxsplit splits are\n\ -done."); +Return a list of the sections in the byte B, using sep as the delimiter,\n\ +starting at the end of the bytes and working to the front.\n\ +If sep is not given, B is split on ASCII whitespace characters\n\ +(space, tab, return, newline, formfeed, vertical tab).\n\ +If maxsplit is given, at most maxsplit splits are done."); static PyObject * bytes_rsplit(PyBytesObject *self, PyObject *args) { Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j; - Py_ssize_t maxsplit = -1, count=0; + Py_ssize_t maxsplit = -1, count = 0; const char *s = PyBytes_AS_STRING(self), *sub; - PyObject *list, *str, *subobj; + PyObject *list, *str, *subobj = Py_None; + PyBuffer vsub; - if (!PyArg_ParseTuple(args, "O|n:rsplit", &subobj, &maxsplit)) + if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit)) return NULL; if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; - if (PyBytes_Check(subobj)) { - sub = PyBytes_AS_STRING(subobj); - n = PyBytes_GET_SIZE(subobj); - } - /* XXX -> Use the modern buffer interface */ - else if (PyObject_AsCharBuffer(subobj, &sub, &n)) + + if (subobj == Py_None) + return rsplit_whitespace(s, len, maxsplit); + + if (_getbuffer(subobj, &vsub) < 0) return NULL; + sub = vsub.buf; + n = vsub.len; if (n == 0) { PyErr_SetString(PyExc_ValueError, "empty separator"); + PyObject_ReleaseBuffer(subobj, &vsub); return NULL; } else if (n == 1) return rsplit_char(s, len, sub[0], maxsplit); list = PyList_New(PREALLOC_SIZE(maxsplit)); - if (list == NULL) + if (list == NULL) { + PyObject_ReleaseBuffer(subobj, &vsub); return NULL; + } j = len; i = j - n; @@ -2349,10 +2442,12 @@ bytes_rsplit(PyBytesObject *self, PyObject *args) FIX_PREALLOC_SIZE(list); if (PyList_Reverse(list) < 0) goto onError; + PyObject_ReleaseBuffer(subobj, &vsub); return list; onError: Py_DECREF(list); + PyObject_ReleaseBuffer(subobj, &vsub); return NULL; } @@ -2542,71 +2637,104 @@ rstrip_helper(unsigned char *myptr, Py_ssize_t mysize, } 
PyDoc_STRVAR(strip__doc__, -"B.strip(bytes) -> bytes\n\ +"B.strip([bytes]) -> bytes\n\ \n\ -Strip leading and trailing bytes contained in the argument."); +Strip leading and trailing bytes contained in the argument.\n\ +If the argument is omitted, strip ASCII whitespace."); static PyObject * -bytes_strip(PyBytesObject *self, PyObject *arg) +bytes_strip(PyBytesObject *self, PyObject *args) { Py_ssize_t left, right, mysize, argsize; void *myptr, *argptr; - if (arg == NULL || !PyBytes_Check(arg)) { - PyErr_SetString(PyExc_TypeError, "strip() requires a bytes argument"); + PyObject *arg = Py_None; + PyBuffer varg; + if (!PyArg_ParseTuple(args, "|O:strip", &arg)) return NULL; + if (arg == Py_None) { + argptr = "\t\n\r\f\v "; + argsize = 6; + } + else { + if (_getbuffer(arg, &varg) < 0) + return NULL; + argptr = varg.buf; + argsize = varg.len; } myptr = self->ob_bytes; mysize = Py_Size(self); - argptr = ((PyBytesObject *)arg)->ob_bytes; - argsize = Py_Size(arg); left = lstrip_helper(myptr, mysize, argptr, argsize); if (left == mysize) right = left; else right = rstrip_helper(myptr, mysize, argptr, argsize); + if (arg != Py_None) + PyObject_ReleaseBuffer(arg, &varg); return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left); } PyDoc_STRVAR(lstrip__doc__, -"B.lstrip(bytes) -> bytes\n\ +"B.lstrip([bytes]) -> bytes\n\ \n\ -Strip leading bytes contained in the argument."); +Strip leading bytes contained in the argument.\n\ +If the argument is omitted, strip leading ASCII whitespace."); static PyObject * -bytes_lstrip(PyBytesObject *self, PyObject *arg) +bytes_lstrip(PyBytesObject *self, PyObject *args) { Py_ssize_t left, right, mysize, argsize; void *myptr, *argptr; - if (arg == NULL || !PyBytes_Check(arg)) { - PyErr_SetString(PyExc_TypeError, "strip() requires a bytes argument"); + PyObject *arg = Py_None; + PyBuffer varg; + if (!PyArg_ParseTuple(args, "|O:lstrip", &arg)) return NULL; + if (arg == Py_None) { + argptr = "\t\n\r\f\v "; + argsize = 6; + } + else { + 
if (_getbuffer(arg, &varg) < 0) + return NULL; + argptr = varg.buf; + argsize = varg.len; } myptr = self->ob_bytes; mysize = Py_Size(self); - argptr = ((PyBytesObject *)arg)->ob_bytes; - argsize = Py_Size(arg); left = lstrip_helper(myptr, mysize, argptr, argsize); right = mysize; + if (arg != Py_None) + PyObject_ReleaseBuffer(arg, &varg); return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left); } PyDoc_STRVAR(rstrip__doc__, -"B.rstrip(bytes) -> bytes\n\ +"B.rstrip([bytes]) -> bytes\n\ \n\ -Strip trailing bytes contained in the argument."); +Strip trailing bytes contained in the argument.\n\ +If the argument is omitted, strip trailing ASCII whitespace."); static PyObject * -bytes_rstrip(PyBytesObject *self, PyObject *arg) +bytes_rstrip(PyBytesObject *self, PyObject *args) { Py_ssize_t left, right, mysize, argsize; void *myptr, *argptr; - if (arg == NULL || !PyBytes_Check(arg)) { - PyErr_SetString(PyExc_TypeError, "strip() requires a bytes argument"); + PyObject *arg = Py_None; + PyBuffer varg; + if (!PyArg_ParseTuple(args, "|O:rstrip", &arg)) return NULL; + if (arg == Py_None) { + argptr = "\t\n\r\f\v "; + argsize = 6; + } + else { + if (_getbuffer(arg, &varg) < 0) + return NULL; + argptr = varg.buf; + argsize = varg.len; } myptr = self->ob_bytes; mysize = Py_Size(self); - argptr = ((PyBytesObject *)arg)->ob_bytes; - argsize = Py_Size(arg); left = 0; right = rstrip_helper(myptr, mysize, argptr, argsize); + if (arg != Py_None) + PyObject_ReleaseBuffer(arg, &varg); return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left); } @@ -2839,9 +2967,9 @@ bytes_methods[] = { {"reverse", (PyCFunction)bytes_reverse, METH_NOARGS, reverse__doc__}, {"pop", (PyCFunction)bytes_pop, METH_VARARGS, pop__doc__}, {"remove", (PyCFunction)bytes_remove, METH_O, remove__doc__}, - {"strip", (PyCFunction)bytes_strip, METH_O, strip__doc__}, - {"lstrip", (PyCFunction)bytes_lstrip, METH_O, lstrip__doc__}, - {"rstrip", (PyCFunction)bytes_rstrip, METH_O, rstrip__doc__}, + 
{"strip", (PyCFunction)bytes_strip, METH_VARARGS, strip__doc__}, + {"lstrip", (PyCFunction)bytes_lstrip, METH_VARARGS, lstrip__doc__}, + {"rstrip", (PyCFunction)bytes_rstrip, METH_VARARGS, rstrip__doc__}, {"decode", (PyCFunction)bytes_decode, METH_VARARGS, decode_doc}, {"__alloc__", (PyCFunction)bytes_alloc, METH_NOARGS, alloc_doc}, {"fromhex", (PyCFunction)bytes_fromhex, METH_VARARGS|METH_CLASS, |