summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Guido van Rossum <guido@python.org> 2007-09-10 16:53:45 (GMT)
committer: Guido van Rossum <guido@python.org> 2007-09-10 16:53:45 (GMT)
commit: 8f95067915fc1cd74c8ef31fccc84796cd45fff3 (patch)
tree: 3a2ab4404185e1d15badcaa9e8751d4f19380b37
parent: 954c31bcc737c99b597870b42611228ed05ac699 (diff)
download: cpython-8f95067915fc1cd74c8ef31fccc84796cd45fff3.zip
cpython-8f95067915fc1cd74c8ef31fccc84796cd45fff3.tar.gz
cpython-8f95067915fc1cd74c8ef31fccc84796cd45fff3.tar.bz2
Bug # 1125 (my code).
Support bytes.split() and bytes.strip() -- these split/strip using ASCII whitespace (tab, space, CR, LF, FF, VT) like their str counterparts. Also for rsplit(), lstrip() and rstrip(). And change all these functions to accept arbitrary buffer-API-supporting arguments. With unit tests.
-rw-r--r-- Lib/test/test_bytes.py 54
-rw-r--r-- Objects/bytesobject.c 238
2 files changed, 233 insertions, 59 deletions
diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py
index 1c161bb..cfcd282 100644
--- a/Lib/test/test_bytes.py
+++ b/Lib/test/test_bytes.py
@@ -617,16 +617,46 @@ class BytesTest(unittest.TestCase):
self.assertEqual(b.split(b'i'), [b'm', b'ss', b'ss', b'pp', b''])
self.assertEqual(b.split(b'ss'), [b'mi', b'i', b'ippi'])
self.assertEqual(b.split(b'w'), [b])
- # require an arg (no magic whitespace split)
- self.assertRaises(TypeError, b.split)
+
+ def test_split_whitespace(self):
+ for b in (b' arf barf ', b'arf\tbarf', b'arf\nbarf', b'arf\rbarf',
+ b'arf\fbarf', b'arf\vbarf'):
+ self.assertEqual(b.split(), [b'arf', b'barf'])
+ self.assertEqual(b.split(None), [b'arf', b'barf'])
+ self.assertEqual(b.split(None, 2), [b'arf', b'barf'])
+ self.assertEqual(b' a bb c '.split(None, 0), [b'a bb c '])
+ self.assertEqual(b' a bb c '.split(None, 1), [b'a', b'bb c '])
+ self.assertEqual(b' a bb c '.split(None, 2), [b'a', b'bb', b'c '])
+ self.assertEqual(b' a bb c '.split(None, 3), [b'a', b'bb', b'c'])
+
+ def test_split_buffer(self):
+ self.assertEqual(b'a b'.split(buffer(b' ')), [b'a', b'b'])
+
+ def test_split_string_error(self):
+ self.assertRaises(TypeError, b'a b'.split, ' ')
def test_rsplit(self):
b = b'mississippi'
self.assertEqual(b.rsplit(b'i'), [b'm', b'ss', b'ss', b'pp', b''])
self.assertEqual(b.rsplit(b'ss'), [b'mi', b'i', b'ippi'])
self.assertEqual(b.rsplit(b'w'), [b])
- # require an arg (no magic whitespace split)
- self.assertRaises(TypeError, b.rsplit)
+
+ def test_rsplit_whitespace(self):
+ for b in (b' arf barf ', b'arf\tbarf', b'arf\nbarf', b'arf\rbarf',
+ b'arf\fbarf', b'arf\vbarf'):
+ self.assertEqual(b.rsplit(), [b'arf', b'barf'])
+ self.assertEqual(b.rsplit(None), [b'arf', b'barf'])
+ self.assertEqual(b.rsplit(None, 2), [b'arf', b'barf'])
+ self.assertEqual(b' a bb c '.rsplit(None, 0), [b' a bb c'])
+ self.assertEqual(b' a bb c '.rsplit(None, 1), [b' a bb', b'c'])
+ self.assertEqual(b' a bb c '.rsplit(None,2), [b' a', b'bb', b'c'])
+ self.assertEqual(b' a bb c '.rsplit(None, 3), [b'a', b'bb', b'c'])
+
+ def test_rplit_buffer(self):
+ self.assertEqual(b'a b'.rsplit(buffer(b' ')), [b'a', b'b'])
+
+ def test_rplit_string_error(self):
+ self.assertRaises(TypeError, b'a b'.rsplit, ' ')
def test_partition(self):
b = b'mississippi'
@@ -670,6 +700,22 @@ class BytesTest(unittest.TestCase):
self.assertEqual(b.rstrip(b'im'), b'mississipp')
self.assertEqual(b.rstrip(b'pim'), b'mississ')
+ def test_strip_whitespace(self):
+ b = b' \t\n\r\f\vabc \t\n\r\f\v'
+ self.assertEqual(b.strip(), b'abc')
+ self.assertEqual(b.lstrip(), b'abc \t\n\r\f\v')
+ self.assertEqual(b.rstrip(), b' \t\n\r\f\vabc')
+
+ def test_strip_buffer(self):
+ self.assertEqual(b'abc'.strip(buffer(b'ac')), b'b')
+ self.assertEqual(b'abc'.lstrip(buffer(b'ac')), b'bc')
+ self.assertEqual(b'abc'.rstrip(buffer(b'ac')), b'ab')
+
+ def test_strip_string_error(self):
+ self.assertRaises(TypeError, b'abc'.strip, 'b')
+ self.assertRaises(TypeError, b'abc'.lstrip, 'b')
+ self.assertRaises(TypeError, b'abc'.rstrip, 'b')
+
def test_ord(self):
b = b'\0A\x7f\x80\xff'
self.assertEqual([ord(b[i:i+1]) for i in range(len(b))],
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
index 930b761..1486c5e 100644
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@@ -2104,7 +2104,7 @@ bytes_replace(PyBytesObject *self, PyObject *args)
Py_LOCAL_INLINE(PyObject *)
split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
{
- register Py_ssize_t i, j, count=0;
+ register Py_ssize_t i, j, count = 0;
PyObject *str;
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
@@ -2113,7 +2113,7 @@ split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
i = j = 0;
while ((j < len) && (maxcount-- > 0)) {
- for(; j<len; j++) {
+ for(; j < len; j++) {
/* I found that using memchr makes no difference */
if (s[j] == ch) {
SPLIT_ADD(s, i, j);
@@ -2133,46 +2133,91 @@ split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
return NULL;
}
+#define ISSPACE(c) (isspace(Py_CHARMASK(c)) && ((c) & 0x80) == 0)
+
+Py_LOCAL_INLINE(PyObject *)
+split_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount)
+{
+ register Py_ssize_t i, j, count = 0;
+ PyObject *str;
+ PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
+
+ if (list == NULL)
+ return NULL;
+
+ for (i = j = 0; i < len; ) {
+ /* find a token */
+ while (i < len && ISSPACE(s[i]))
+ i++;
+ j = i;
+ while (i < len && !ISSPACE(s[i]))
+ i++;
+ if (j < i) {
+ if (maxcount-- <= 0)
+ break;
+ SPLIT_ADD(s, j, i);
+ while (i < len && ISSPACE(s[i]))
+ i++;
+ j = i;
+ }
+ }
+ if (j < len) {
+ SPLIT_ADD(s, j, len);
+ }
+ FIX_PREALLOC_SIZE(list);
+ return list;
+
+ onError:
+ Py_DECREF(list);
+ return NULL;
+}
+
PyDoc_STRVAR(split__doc__,
-"B.split(sep [,maxsplit]) -> list of bytes\n\
+"B.split([sep [, maxsplit]]) -> list of bytes\n\
\n\
-Return a list of the bytes in the string B, using sep as the\n\
-delimiter. If maxsplit is given, at most maxsplit\n\
-splits are done.");
+Return a list of the bytes in the string B, using sep as the delimiter.\n\
+If sep is not given, B is split on ASCII whitespace charcters\n\
+(space, tab, return, newline, formfeed, vertical tab).\n\
+If maxsplit is given, at most maxsplit splits are done.");
static PyObject *
bytes_split(PyBytesObject *self, PyObject *args)
{
Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j;
- Py_ssize_t maxsplit = -1, count=0;
+ Py_ssize_t maxsplit = -1, count = 0;
const char *s = PyBytes_AS_STRING(self), *sub;
- PyObject *list, *str, *subobj;
+ PyObject *list, *str, *subobj = Py_None;
+ PyBuffer vsub;
#ifdef USE_FAST
Py_ssize_t pos;
#endif
- if (!PyArg_ParseTuple(args, "O|n:split", &subobj, &maxsplit))
+ if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
return NULL;
if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX;
- if (PyBytes_Check(subobj)) {
- sub = PyBytes_AS_STRING(subobj);
- n = PyBytes_GET_SIZE(subobj);
- }
- /* XXX -> use the modern buffer interface */
- else if (PyObject_AsCharBuffer(subobj, &sub, &n))
+
+ if (subobj == Py_None)
+ return split_whitespace(s, len, maxsplit);
+
+ if (_getbuffer(subobj, &vsub) < 0)
return NULL;
+ sub = vsub.buf;
+ n = vsub.len;
if (n == 0) {
PyErr_SetString(PyExc_ValueError, "empty separator");
+ PyObject_ReleaseBuffer(subobj, &vsub);
return NULL;
}
- else if (n == 1)
+ if (n == 1)
return split_char(s, len, sub[0], maxsplit);
list = PyList_New(PREALLOC_SIZE(maxsplit));
- if (list == NULL)
+ if (list == NULL) {
+ PyObject_ReleaseBuffer(subobj, &vsub);
return NULL;
+ }
#ifdef USE_FAST
i = j = 0;
@@ -2198,10 +2243,12 @@ bytes_split(PyBytesObject *self, PyObject *args)
#endif
SPLIT_ADD(s, i, len);
FIX_PREALLOC_SIZE(list);
+ PyObject_ReleaseBuffer(subobj, &vsub);
return list;
onError:
Py_DECREF(list);
+ PyObject_ReleaseBuffer(subobj, &vsub);
return NULL;
}
@@ -2293,44 +2340,90 @@ rsplit_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
return NULL;
}
+Py_LOCAL_INLINE(PyObject *)
+rsplit_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount)
+{
+ register Py_ssize_t i, j, count = 0;
+ PyObject *str;
+ PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
+
+ if (list == NULL)
+ return NULL;
+
+ for (i = j = len - 1; i >= 0; ) {
+ /* find a token */
+ while (i >= 0 && Py_UNICODE_ISSPACE(s[i]))
+ i--;
+ j = i;
+ while (i >= 0 && !Py_UNICODE_ISSPACE(s[i]))
+ i--;
+ if (j > i) {
+ if (maxcount-- <= 0)
+ break;
+ SPLIT_ADD(s, i + 1, j + 1);
+ while (i >= 0 && Py_UNICODE_ISSPACE(s[i]))
+ i--;
+ j = i;
+ }
+ }
+ if (j >= 0) {
+ SPLIT_ADD(s, 0, j + 1);
+ }
+ FIX_PREALLOC_SIZE(list);
+ if (PyList_Reverse(list) < 0)
+ goto onError;
+
+ return list;
+
+ onError:
+ Py_DECREF(list);
+ return NULL;
+}
+
PyDoc_STRVAR(rsplit__doc__,
"B.rsplit(sep [,maxsplit]) -> list of bytes\n\
\n\
-Return a list of the sections in the byte B, using sep as the\n\
-delimiter, starting at the end of the bytes and working\n\
-to the front. If maxsplit is given, at most maxsplit splits are\n\
-done.");
+Return a list of the sections in the byte B, using sep as the delimiter,\n\
+starting at the end of the bytes and working to the front.\n\
+If sep is not given, B is split on ASCII whitespace characters\n\
+(space, tab, return, newline, formfeed, vertical tab).\n\
+If maxsplit is given, at most maxsplit splits are done.");
static PyObject *
bytes_rsplit(PyBytesObject *self, PyObject *args)
{
Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j;
- Py_ssize_t maxsplit = -1, count=0;
+ Py_ssize_t maxsplit = -1, count = 0;
const char *s = PyBytes_AS_STRING(self), *sub;
- PyObject *list, *str, *subobj;
+ PyObject *list, *str, *subobj = Py_None;
+ PyBuffer vsub;
- if (!PyArg_ParseTuple(args, "O|n:rsplit", &subobj, &maxsplit))
+ if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
return NULL;
if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX;
- if (PyBytes_Check(subobj)) {
- sub = PyBytes_AS_STRING(subobj);
- n = PyBytes_GET_SIZE(subobj);
- }
- /* XXX -> Use the modern buffer interface */
- else if (PyObject_AsCharBuffer(subobj, &sub, &n))
+
+ if (subobj == Py_None)
+ return rsplit_whitespace(s, len, maxsplit);
+
+ if (_getbuffer(subobj, &vsub) < 0)
return NULL;
+ sub = vsub.buf;
+ n = vsub.len;
if (n == 0) {
PyErr_SetString(PyExc_ValueError, "empty separator");
+ PyObject_ReleaseBuffer(subobj, &vsub);
return NULL;
}
else if (n == 1)
return rsplit_char(s, len, sub[0], maxsplit);
list = PyList_New(PREALLOC_SIZE(maxsplit));
- if (list == NULL)
+ if (list == NULL) {
+ PyObject_ReleaseBuffer(subobj, &vsub);
return NULL;
+ }
j = len;
i = j - n;
@@ -2349,10 +2442,12 @@ bytes_rsplit(PyBytesObject *self, PyObject *args)
FIX_PREALLOC_SIZE(list);
if (PyList_Reverse(list) < 0)
goto onError;
+ PyObject_ReleaseBuffer(subobj, &vsub);
return list;
onError:
Py_DECREF(list);
+ PyObject_ReleaseBuffer(subobj, &vsub);
return NULL;
}
@@ -2542,71 +2637,104 @@ rstrip_helper(unsigned char *myptr, Py_ssize_t mysize,
}
PyDoc_STRVAR(strip__doc__,
-"B.strip(bytes) -> bytes\n\
+"B.strip([bytes]) -> bytes\n\
\n\
-Strip leading and trailing bytes contained in the argument.");
+Strip leading and trailing bytes contained in the argument.\n\
+If the argument is omitted, strip ASCII whitespace.");
static PyObject *
-bytes_strip(PyBytesObject *self, PyObject *arg)
+bytes_strip(PyBytesObject *self, PyObject *args)
{
Py_ssize_t left, right, mysize, argsize;
void *myptr, *argptr;
- if (arg == NULL || !PyBytes_Check(arg)) {
- PyErr_SetString(PyExc_TypeError, "strip() requires a bytes argument");
+ PyObject *arg = Py_None;
+ PyBuffer varg;
+ if (!PyArg_ParseTuple(args, "|O:strip", &arg))
return NULL;
+ if (arg == Py_None) {
+ argptr = "\t\n\r\f\v ";
+ argsize = 6;
+ }
+ else {
+ if (_getbuffer(arg, &varg) < 0)
+ return NULL;
+ argptr = varg.buf;
+ argsize = varg.len;
}
myptr = self->ob_bytes;
mysize = Py_Size(self);
- argptr = ((PyBytesObject *)arg)->ob_bytes;
- argsize = Py_Size(arg);
left = lstrip_helper(myptr, mysize, argptr, argsize);
if (left == mysize)
right = left;
else
right = rstrip_helper(myptr, mysize, argptr, argsize);
+ if (arg != Py_None)
+ PyObject_ReleaseBuffer(arg, &varg);
return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left);
}
PyDoc_STRVAR(lstrip__doc__,
-"B.lstrip(bytes) -> bytes\n\
+"B.lstrip([bytes]) -> bytes\n\
\n\
-Strip leading bytes contained in the argument.");
+Strip leading bytes contained in the argument.\n\
+If the argument is omitted, strip leading ASCII whitespace.");
static PyObject *
-bytes_lstrip(PyBytesObject *self, PyObject *arg)
+bytes_lstrip(PyBytesObject *self, PyObject *args)
{
Py_ssize_t left, right, mysize, argsize;
void *myptr, *argptr;
- if (arg == NULL || !PyBytes_Check(arg)) {
- PyErr_SetString(PyExc_TypeError, "strip() requires a bytes argument");
+ PyObject *arg = Py_None;
+ PyBuffer varg;
+ if (!PyArg_ParseTuple(args, "|O:lstrip", &arg))
return NULL;
+ if (arg == Py_None) {
+ argptr = "\t\n\r\f\v ";
+ argsize = 6;
+ }
+ else {
+ if (_getbuffer(arg, &varg) < 0)
+ return NULL;
+ argptr = varg.buf;
+ argsize = varg.len;
}
myptr = self->ob_bytes;
mysize = Py_Size(self);
- argptr = ((PyBytesObject *)arg)->ob_bytes;
- argsize = Py_Size(arg);
left = lstrip_helper(myptr, mysize, argptr, argsize);
right = mysize;
+ if (arg != Py_None)
+ PyObject_ReleaseBuffer(arg, &varg);
return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left);
}
PyDoc_STRVAR(rstrip__doc__,
-"B.rstrip(bytes) -> bytes\n\
+"B.rstrip([bytes]) -> bytes\n\
\n\
-Strip trailing bytes contained in the argument.");
+Strip trailing bytes contained in the argument.\n\
+If the argument is omitted, strip trailing ASCII whitespace.");
static PyObject *
-bytes_rstrip(PyBytesObject *self, PyObject *arg)
+bytes_rstrip(PyBytesObject *self, PyObject *args)
{
Py_ssize_t left, right, mysize, argsize;
void *myptr, *argptr;
- if (arg == NULL || !PyBytes_Check(arg)) {
- PyErr_SetString(PyExc_TypeError, "strip() requires a bytes argument");
+ PyObject *arg = Py_None;
+ PyBuffer varg;
+ if (!PyArg_ParseTuple(args, "|O:rstrip", &arg))
return NULL;
+ if (arg == Py_None) {
+ argptr = "\t\n\r\f\v ";
+ argsize = 6;
+ }
+ else {
+ if (_getbuffer(arg, &varg) < 0)
+ return NULL;
+ argptr = varg.buf;
+ argsize = varg.len;
}
myptr = self->ob_bytes;
mysize = Py_Size(self);
- argptr = ((PyBytesObject *)arg)->ob_bytes;
- argsize = Py_Size(arg);
left = 0;
right = rstrip_helper(myptr, mysize, argptr, argsize);
+ if (arg != Py_None)
+ PyObject_ReleaseBuffer(arg, &varg);
return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left);
}
@@ -2839,9 +2967,9 @@ bytes_methods[] = {
{"reverse", (PyCFunction)bytes_reverse, METH_NOARGS, reverse__doc__},
{"pop", (PyCFunction)bytes_pop, METH_VARARGS, pop__doc__},
{"remove", (PyCFunction)bytes_remove, METH_O, remove__doc__},
- {"strip", (PyCFunction)bytes_strip, METH_O, strip__doc__},
- {"lstrip", (PyCFunction)bytes_lstrip, METH_O, lstrip__doc__},
- {"rstrip", (PyCFunction)bytes_rstrip, METH_O, rstrip__doc__},
+ {"strip", (PyCFunction)bytes_strip, METH_VARARGS, strip__doc__},
+ {"lstrip", (PyCFunction)bytes_lstrip, METH_VARARGS, lstrip__doc__},
+ {"rstrip", (PyCFunction)bytes_rstrip, METH_VARARGS, rstrip__doc__},
{"decode", (PyCFunction)bytes_decode, METH_VARARGS, decode_doc},
{"__alloc__", (PyCFunction)bytes_alloc, METH_NOARGS, alloc_doc},
{"fromhex", (PyCFunction)bytes_fromhex, METH_VARARGS|METH_CLASS,