From 3ae811b57d227a220f207869487fd9251e278608 Mon Sep 17 00:00:00 2001 From: Hye-Shik Chang Date: Mon, 15 Dec 2003 18:49:53 +0000 Subject: Add rsplit method for str and unicode builtin types. SF feature request #801847. Original patch is written by Sean Reifschneider. --- Doc/lib/libstdtypes.tex | 18 +++++ Doc/lib/libstring.tex | 17 +++++ Include/unicodeobject.h | 20 +++++ Lib/string.py | 12 +++ Lib/test/string_tests.py | 20 +++++ Objects/stringobject.c | 124 ++++++++++++++++++++++++++++++ Objects/unicodeobject.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++- 7 files changed, 402 insertions(+), 1 deletion(-) diff --git a/Doc/lib/libstdtypes.tex b/Doc/lib/libstdtypes.tex index 7feda60..4ce6ec5 100644 --- a/Doc/lib/libstdtypes.tex +++ b/Doc/lib/libstdtypes.tex @@ -694,6 +694,24 @@ The original string is returned if \versionchanged[Support for the \var{fillchar} argument]{2.4} \end{methoddesc} +\begin{methoddesc}[string]{rsplit}{\optional{, sep\optional{, maxsplit}}} +Return a list of the words of the string, scanning the string from +the end working forward. The resulting list of words is in the +same order as \function{split()}. If the optional second argument +\var{sep} is absent or \code{None}, the words are separated by +arbitrary strings of whitespace characters (space, tab, newline, +return, formfeed). If the second argument \var{sep} is present and +not \code{None}, it specifies a string to be used as the word +separator. The returned list will then have one more item than the +number of non-overlapping occurrences of the separator in the string. +The optional third argument \var{maxsplit} defaults to 0. If it +is nonzero, at most \var{maxsplit} number of splits occur, and the +remainder of the string is returned as the first element of the +list (thus, the list will have at most \code{\var{maxsplit}+1} +elements). +\versionadded{2.4} +\end{methoddesc} + \begin{methoddesc}[string]{rstrip}{\optional{chars}} Return a copy of the string with trailing characters removed. If \var{chars} is omitted or \code{None}, whitespace characters are diff --git a/Doc/lib/libstring.tex b/Doc/lib/libstring.tex index 3f902cf..11054e2 100644 --- a/Doc/lib/libstring.tex +++ b/Doc/lib/libstring.tex @@ -215,6 +215,23 @@ The functions defined in this module are: elements). \end{funcdesc} +\begin{funcdesc}{rsplit}{s\optional{, sep\optional{, maxsplit}}} + Return a list of the words of the string \var{s}, scanning \var{s} from + the end working forward. The resulting list of words is in the same + order as \function{split()}. If the optional second argument \var{sep} + is absent or \code{None}, the words are separated by arbitrary strings + of whitespace characters (space, tab, newline, return, formfeed). + If the second argument \var{sep} is present and not \code{None}, it + specifies a string to be used as the word separator. The returned + list will then have one more item than the number of non-overlapping + occurrences of the separator in the string. The optional third argument + \var{maxsplit} defaults to 0. If it is nonzero, at most \var{maxsplit} + number of splits occur, and the remainder of the string is returned + as the first element of the list (thus, the list will have at most + \code{\var{maxsplit}+1} elements). + \versionadded{2.4} +\end{funcdesc} + \begin{funcdesc}{splitfields}{s\optional{, sep\optional{, maxsplit}}} This function behaves identically to \function{split()}. (In the past, \function{split()} was only used with one argument, while diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 24e59de..1690f8a 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -185,6 +185,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_Resize PyUnicodeUCS2_Resize # define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding # define PyUnicode_Split PyUnicodeUCS2_Split +# define PyUnicode_RSplit PyUnicodeUCS2_RSplit # define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines # define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch # define PyUnicode_Translate PyUnicodeUCS2_Translate @@ -959,6 +960,25 @@ PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( int keepends /* If true, line end markers are included */ ); +/* Split a string giving a list of Unicode strings. + + If sep is NULL, splitting will be done at all whitespace + substrings. Otherwise, splits occur at the given separator. + + At most maxsplit splits will be done. But unlike PyUnicode_Split + PyUnicode_RSplit splits from the end of the string. If negative, + no limit is set. + + Separators are not included in the resulting list. + +*/ + +PyAPI_FUNC(PyObject*) PyUnicode_RSplit( + PyObject *s, /* String to split */ + PyObject *sep, /* String separator */ + int maxsplit /* Maxsplit count */ + ); + /* Translate a string by applying a character mapping table to it and return the resulting Unicode object. diff --git a/Lib/string.py b/Lib/string.py index 0a77f46..bc10c20 100644 --- a/Lib/string.py +++ b/Lib/string.py @@ -121,6 +121,18 @@ def split(s, sep=None, maxsplit=-1): return s.split(sep, maxsplit) splitfields = split +# Split a string into a list of space/tab-separated words +def rsplit(s, sep=None, maxsplit=-1): + """rsplit(s [,sep [,maxsplit]]) -> list of strings + + Return a list of the words in the string s, using sep as the + delimiter string, starting at the end of the string and working + to the front. If maxsplit is given, at most maxsplit splits are + done. If sep is not specified or is None, any whitespace string + is a separator. + """ + return s.rsplit(sep, maxsplit) + # Join fields with optional separator def join(words, sep = ' '): """join(list [,sep]) -> string diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 236c577..c3e53ad 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -189,6 +189,26 @@ class CommonTest(unittest.TestCase): self.checkraises(TypeError, 'hello', 'split', 42, 42, 42) + def test_rsplit(self): + self.checkequal(['this', 'is', 'the', 'rsplit', 'function'], + 'this is the rsplit function', 'rsplit') + self.checkequal(['a', 'b', 'c', 'd'], 'a|b|c|d', 'rsplit', '|') + self.checkequal(['a|b', 'c', 'd'], 'a|b|c|d', 'rsplit', '|', 2) + self.checkequal(['a b c', 'd'], 'a b c d', 'rsplit', None, 1) + self.checkequal(['a b', 'c', 'd'], 'a b c d', 'rsplit', None, 2) + self.checkequal(['a', 'b', 'c', 'd'], 'a b c d', 'rsplit', None, 3) + self.checkequal(['a', 'b', 'c', 'd'], 'a b c d', 'rsplit', None, 4) + self.checkequal(['a b c d'], 'a b c d', 'rsplit', None, 0) + self.checkequal(['a, b, c', 'd'], 'a, b, c, d', 'rsplit', ', ', 1) + self.checkequal(['a, b', 'c', 'd'], 'a, b, c, d', 'rsplit', ', ', 2) + self.checkequal(['a', 'b', 'c', 'd'], 'a, b, c, d', 'rsplit', ', ', 3) + self.checkequal(['a', 'b', 'c', 'd'], 'a, b, c, d', 'rsplit', ', ', 4) + self.checkequal(['a, b, c, d'], 'a, b, c, d', 'rsplit', ', ', 0) + self.checkequal(['a b', 'c', 'd'], 'a b c d', 'rsplit', None, 2) + self.checkequal(['a\x00b', 'c'], 'a\x00b\x00c', 'rsplit', '\x00', 1) + self.checkequal(['', ''], 'abcd', 'rsplit', 'abcd') + self.checkequal([u'a b', u'c', u'd'], 'a b c d', 'rsplit', u' ', 2) + def test_strip(self): self.checkequal('hello', ' hello ', 'strip') self.checkequal('hello ', ' hello ', 'lstrip') diff --git a/Objects/stringobject.c b/Objects/stringobject.c index d3351df..9512059 100644 --- a/Objects/stringobject.c +++ b/Objects/stringobject.c @@ -1407,6 +1407,129 @@ string_split(PyStringObject *self, PyObject *args) return NULL; } +static PyObject * +rsplit_whitespace(const char *s, int len, int maxsplit) +{ + int i, j, err; + PyObject* item; + PyObject *list = PyList_New(0); + + if (list == NULL) + return NULL; + + for (i = j = len - 1; i >= 0; ) { + while (i >= 0 && isspace(Py_CHARMASK(s[i]))) + i--; + j = i; + while (i >= 0 && !isspace(Py_CHARMASK(s[i]))) + i--; + if (j > i) { + if (maxsplit-- <= 0) + break; + item = PyString_FromStringAndSize(s+i+1, (int)(j-i)); + if (item == NULL) + goto finally; + err = PyList_Insert(list, 0, item); + Py_DECREF(item); + if (err < 0) + goto finally; + while (i >= 0 && isspace(Py_CHARMASK(s[i]))) + i--; + j = i; + } + } + if (j >= 0) { + item = PyString_FromStringAndSize(s, (int)(j + 1)); + if (item == NULL) + goto finally; + err = PyList_Insert(list, 0, item); + Py_DECREF(item); + if (err < 0) + goto finally; + } + return list; + finally: + Py_DECREF(list); + return NULL; +} + + +PyDoc_STRVAR(rsplit__doc__, +"S.rsplit([sep [,maxsplit]]) -> list of strings\n\ +\n\ +Return a list of the words in the string S, using sep as the\n\ +delimiter string, starting at the end of the string and working\n\ +to the front. If maxsplit is given, at most maxsplit splits are\n\ +done. If sep is not specified or is None, any whitespace string\n\ +is a separator."); + +static PyObject * +string_rsplit(PyStringObject *self, PyObject *args) +{ + int len = PyString_GET_SIZE(self), n, i, j, err; + int maxsplit = -1; + const char *s = PyString_AS_STRING(self), *sub; + PyObject *list, *item, *subobj = Py_None; + + if (!PyArg_ParseTuple(args, "|Oi:rsplit", &subobj, &maxsplit)) + return NULL; + if (maxsplit < 0) + maxsplit = INT_MAX; + if (subobj == Py_None) + return rsplit_whitespace(s, len, maxsplit); + if (PyString_Check(subobj)) { + sub = PyString_AS_STRING(subobj); + n = PyString_GET_SIZE(subobj); + } +#ifdef Py_USING_UNICODE + else if (PyUnicode_Check(subobj)) + return PyUnicode_RSplit((PyObject *)self, subobj, maxsplit); +#endif + else if (PyObject_AsCharBuffer(subobj, &sub, &n)) + return NULL; + if (n == 0) { + PyErr_SetString(PyExc_ValueError, "empty separator"); + return NULL; + } + + list = PyList_New(0); + if (list == NULL) + return NULL; + + j = len; + i = j - n; + while (i >= 0) { + if (s[i] == sub[0] && memcmp(s+i, sub, n) == 0) { + if (maxsplit-- <= 0) + break; + item = PyString_FromStringAndSize(s+i+n, (int)(j-i-n)); + if (item == NULL) + goto fail; + err = PyList_Insert(list, 0, item); + Py_DECREF(item); + if (err < 0) + goto fail; + j = i; + i -= n; + } + else + i--; + } + item = PyString_FromStringAndSize(s, j); + if (item == NULL) + goto fail; + err = PyList_Insert(list, 0, item); + Py_DECREF(item); + if (err < 0) + goto fail; + + return list; + + fail: + Py_DECREF(list); + return NULL; +} + PyDoc_STRVAR(join__doc__, "S.join(sequence) -> string\n\ @@ -3064,6 +3187,7 @@ string_methods[] = { string.maketrans(). */ {"join", (PyCFunction)string_join, METH_O, join__doc__}, {"split", (PyCFunction)string_split, METH_VARARGS, split__doc__}, + {"rsplit", (PyCFunction)string_rsplit, METH_VARARGS, rsplit__doc__}, {"lower", (PyCFunction)string_lower, METH_NOARGS, lower__doc__}, {"upper", (PyCFunction)string_upper, METH_NOARGS, upper__doc__}, {"islower", (PyCFunction)string_islower, METH_NOARGS, islower__doc__}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index f87d749..19d0353 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -4053,7 +4053,7 @@ PyUnicodeObject *pad(PyUnicodeObject *self, } #define SPLIT_APPEND(data, left, right) \ - str = PyUnicode_FromUnicode(data + left, right - left); \ + str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ if (!str) \ goto onError; \ if (PyList_Append(list, str)) { \ @@ -4063,6 +4063,17 @@ PyUnicodeObject *pad(PyUnicodeObject *self, else \ Py_DECREF(str); +#define SPLIT_INSERT(data, left, right) \ + str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ + if (!str) \ + goto onError; \ + if (PyList_Insert(list, 0, str)) { \ + Py_DECREF(str); \ + goto onError; \ + } \ + else \ + Py_DECREF(str); + static PyObject *split_whitespace(PyUnicodeObject *self, PyObject *list, @@ -4214,7 +4225,106 @@ PyObject *split_substring(PyUnicodeObject *self, return NULL; } +static +PyObject *rsplit_whitespace(PyUnicodeObject *self, + PyObject *list, + int maxcount) +{ + register int i; + register int j; + int len = self->length; + PyObject *str; + + for (i = j = len - 1; i >= 0; ) { + /* find a token */ + while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i])) + i--; + j = i; + while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i])) + i--; + if (j > i) { + if (maxcount-- <= 0) + break; + SPLIT_INSERT(self->str, i + 1, j + 1); + while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i])) + i--; + j = i; + } + } + if (j >= 0) { + SPLIT_INSERT(self->str, 0, j + 1); + } + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +static +PyObject *rsplit_char(PyUnicodeObject *self, + PyObject *list, + Py_UNICODE ch, + int maxcount) +{ + register int i; + register int j; + int len = self->length; + PyObject *str; + + for (i = j = len - 1; i >= 0; ) { + if (self->str[i] == ch) { + if (maxcount-- <= 0) + break; + SPLIT_INSERT(self->str, i + 1, j + 1); + j = i = i - 1; + } else + i--; + } + if (j >= 0) { + SPLIT_INSERT(self->str, 0, j + 1); + } + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +static +PyObject *rsplit_substring(PyUnicodeObject *self, + PyObject *list, + PyUnicodeObject *substring, + int maxcount) +{ + register int i; + register int j; + int len = self->length; + int sublen = substring->length; + PyObject *str; + + for (i = len - sublen, j = len; i >= 0; ) { + if (Py_UNICODE_MATCH(self, i, substring)) { + if (maxcount-- <= 0) + break; + SPLIT_INSERT(self->str, i + sublen, j); + j = i; + i -= sublen; + } else + i--; + } + if (j >= 0) { + SPLIT_INSERT(self->str, 0, j); + } + return list; + + onError: + Py_DECREF(list); + return NULL; +} + #undef SPLIT_APPEND +#undef SPLIT_INSERT static PyObject *split(PyUnicodeObject *self, @@ -4246,6 +4356,35 @@ PyObject *split(PyUnicodeObject *self, } static +PyObject *rsplit(PyUnicodeObject *self, + PyUnicodeObject *substring, + int maxcount) +{ + PyObject *list; + + if (maxcount < 0) + maxcount = INT_MAX; + + list = PyList_New(0); + if (!list) + return NULL; + + if (substring == NULL) + return rsplit_whitespace(self,list,maxcount); + + else if (substring->length == 1) + return rsplit_char(self,list,substring->str[0],maxcount); + + else if (substring->length == 0) { + Py_DECREF(list); + PyErr_SetString(PyExc_ValueError, "empty separator"); + return NULL; + } + else + return rsplit_substring(self,list,substring,maxcount); +} + +static PyObject *replace(PyUnicodeObject *self, PyUnicodeObject *str1, PyUnicodeObject *str2, @@ -5675,6 +5814,56 @@ unicode_split(PyUnicodeObject *self, PyObject *args) return PyUnicode_Split((PyObject *)self, substring, maxcount); } +PyObject *PyUnicode_RSplit(PyObject *s, + PyObject *sep, + int maxsplit) +{ + PyObject *result; + + s = PyUnicode_FromObject(s); + if (s == NULL) + return NULL; + if (sep != NULL) { + sep = PyUnicode_FromObject(sep); + if (sep == NULL) { + Py_DECREF(s); + return NULL; + } + } + + result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); + + Py_DECREF(s); + Py_XDECREF(sep); + return result; +} + +PyDoc_STRVAR(rsplit__doc__, +"S.rsplit([sep [,maxsplit]]) -> list of strings\n\ +\n\ +Return a list of the words in S, using sep as the\n\ +delimiter string, starting at the end of the string and\n\ +working to the front. If maxsplit is given, at most maxsplit\n\ +splits are done. If sep is not specified, any whitespace string\n\ +is a separator."); + +static PyObject* +unicode_rsplit(PyUnicodeObject *self, PyObject *args) +{ + PyObject *substring = Py_None; + int maxcount = -1; + + if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount)) + return NULL; + + if (substring == Py_None) + return rsplit(self, NULL, maxcount); + else if (PyUnicode_Check(substring)) + return rsplit(self, (PyUnicodeObject *)substring, maxcount); + else + return PyUnicode_RSplit((PyObject *)self, substring, maxcount); +} + PyDoc_STRVAR(splitlines__doc__, "S.splitlines([keepends]]) -> list of strings\n\ \n\ @@ -5870,6 +6059,7 @@ static PyMethodDef unicode_methods[] = { {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, + {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, -- cgit v0.12