diff options
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 772 |
1 files changed, 455 insertions, 317 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index e62c774..6cdb0fc 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -4,6 +4,9 @@ Unicode implementation based on original code by Fredrik Lundh, modified by Marc-Andre Lemburg <mal@lemburg.com> according to the Unicode Integration Proposal (see file Misc/unicode.txt). +Major speed upgrades to the method implementations at the Reykjavik +NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. + Copyright (c) Corporation for National Research Initiatives. -------------------------------------------------------------------- @@ -121,6 +124,51 @@ PyUnicode_GetMax(void) #endif } +/* --- Bloom Filters ----------------------------------------------------- */ + +/* stuff to implement simple "bloom filters" for Unicode characters. + to keep things simple, we use a single bitmask, using the least 5 + bits from each unicode characters as the bit index. */ + +/* the linebreak mask is set up by Unicode_Init below */ + +#define BLOOM_MASK unsigned long + +static BLOOM_MASK bloom_linebreak; + +#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) + +#define BLOOM_LINEBREAK(ch)\ + (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch))) + +Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) +{ + /* calculate simple bloom-style bitmask for a given unicode string */ + + long mask; + Py_ssize_t i; + + mask = 0; + for (i = 0; i < len; i++) + mask |= (1 << (ptr[i] & 0x1F)); + + return mask; +} + +Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) +{ + Py_ssize_t i; + + for (i = 0; i < setlen; i++) + if (set[i] == chr) + return 1; + + return 0; +} + +#define BLOOM_MEMBER(mask, chr, set, setlen)\ + BLOOM(mask, chr) && unicode_member(chr, set, setlen) + /* --- Unicode Object ----------------------------------------------------- */ static @@ -136,6 +184,7 @@ int unicode_resize(register PyUnicodeObject *unicode, /* Resizing shared object (unicode_empty or single character objects) in-place is not allowed. Use PyUnicode_Resize() instead ! */ + if (unicode == unicode_empty || (unicode->length == 1 && unicode->str[0] < 256U && @@ -145,8 +194,11 @@ int unicode_resize(register PyUnicodeObject *unicode, return -1; } - /* We allocate one more byte to make sure the string is - Ux0000 terminated -- XXX is this needed ? */ + /* We allocate one more byte to make sure the string is Ux0000 terminated. + The overallocation is also used by fastsearch, which assumes that it's + safe to look at str[length] (without making any assumptions about what + it contains). */ + oldstr = unicode->str; PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); if (!unicode->str) { @@ -181,7 +233,7 @@ PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) { register PyUnicodeObject *unicode; - /* Optimization fo empty strings */ + /* Optimization for empty strings */ if (length == 0 && unicode_empty != NULL) { Py_INCREF(unicode_empty); return unicode_empty; @@ -1963,9 +2015,20 @@ onError: */ -static const Py_UNICODE *findchar(const Py_UNICODE *s, - Py_ssize_t size, - Py_UNICODE ch); +Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, + Py_ssize_t size, + Py_UNICODE ch) +{ + /* like wcschr, but doesn't stop at NULL characters */ + + while (size-- > 0) { + if (*s == ch) + return s; + s++; + } + + return NULL; +} static PyObject *unicodeescape_string(const Py_UNICODE *s, @@ -2313,7 +2376,7 @@ PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, end = s + size; while (s < end) { - *p = *(Py_UNICODE *)s; + memcpy(p, s, sizeof(Py_UNICODE)); /* We have to sanity check the raw data, otherwise doom looms for some malformed UCS-4 data. */ if ( @@ -3791,124 +3854,104 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s, /* --- Helpers ------------------------------------------------------------ */ -static -Py_ssize_t count(PyUnicodeObject *self, - Py_ssize_t start, - Py_ssize_t end, - PyUnicodeObject *substring) -{ - Py_ssize_t count = 0; +#define STRINGLIB_CHAR Py_UNICODE - if (start < 0) - start += self->length; - if (start < 0) - start = 0; - if (end > self->length) - end = self->length; - if (end < 0) - end += self->length; - if (end < 0) - end = 0; +#define STRINGLIB_LEN PyUnicode_GET_SIZE +#define STRINGLIB_NEW PyUnicode_FromUnicode +#define STRINGLIB_STR PyUnicode_AS_UNICODE - if (substring->length == 0) - return (end - start + 1); +Py_LOCAL_INLINE(int) +STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len) +{ + if (str[0] != other[0]) + return 1; + return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE)); +} - end -= substring->length; +#define STRINGLIB_EMPTY unicode_empty - while (start <= end) - if (Py_UNICODE_MATCH(self, start, substring)) { - count++; - start += substring->length; - } else - start++; +#include "stringlib/fastsearch.h" - return count; -} +#include "stringlib/count.h" +#include "stringlib/find.h" +#include "stringlib/partition.h" + +/* helper macro to fixup start/end slice values */ +#define FIX_START_END(obj) \ + if (start < 0) \ + start += (obj)->length; \ + if (start < 0) \ + start = 0; \ + if (end > (obj)->length) \ + end = (obj)->length; \ + if (end < 0) \ + end += (obj)->length; \ + if (end < 0) \ + end = 0; Py_ssize_t PyUnicode_Count(PyObject *str, - PyObject *substr, - Py_ssize_t start, - Py_ssize_t end) + PyObject *substr, + Py_ssize_t start, + Py_ssize_t end) { Py_ssize_t result; + PyUnicodeObject* str_obj; + PyUnicodeObject* sub_obj; - str = PyUnicode_FromObject(str); - if (str == NULL) + str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); + if (!str_obj) return -1; - substr = PyUnicode_FromObject(substr); - if (substr == NULL) { - Py_DECREF(str); + sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); + if (!sub_obj) { + Py_DECREF(str_obj); return -1; } - result = count((PyUnicodeObject *)str, - start, end, - (PyUnicodeObject *)substr); - - Py_DECREF(str); - Py_DECREF(substr); - return result; -} - -static -Py_ssize_t findstring(PyUnicodeObject *self, - PyUnicodeObject *substring, - Py_ssize_t start, - Py_ssize_t end, - int direction) -{ - if (start < 0) - start += self->length; - if (start < 0) - start = 0; - - if (end > self->length) - end = self->length; - if (end < 0) - end += self->length; - if (end < 0) - end = 0; + FIX_START_END(str_obj); - if (substring->length == 0) - return (direction > 0) ? start : end; - - end -= substring->length; + result = stringlib_count( + str_obj->str + start, end - start, sub_obj->str, sub_obj->length + ); - if (direction < 0) { - for (; end >= start; end--) - if (Py_UNICODE_MATCH(self, end, substring)) - return end; - } else { - for (; start <= end; start++) - if (Py_UNICODE_MATCH(self, start, substring)) - return start; - } + Py_DECREF(sub_obj); + Py_DECREF(str_obj); - return -1; + return result; } Py_ssize_t PyUnicode_Find(PyObject *str, - PyObject *substr, - Py_ssize_t start, - Py_ssize_t end, - int direction) + PyObject *sub, + Py_ssize_t start, + Py_ssize_t end, + int direction) { Py_ssize_t result; str = PyUnicode_FromObject(str); - if (str == NULL) + if (!str) return -2; - substr = PyUnicode_FromObject(substr); - if (substr == NULL) { + sub = PyUnicode_FromObject(sub); + if (!sub) { Py_DECREF(str); return -2; } - result = findstring((PyUnicodeObject *)str, - (PyUnicodeObject *)substr, - start, end, direction); + if (direction > 0) + result = stringlib_find_slice( + PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), + PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), + start, end + ); + else + result = stringlib_rfind_slice( + PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), + PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), + start, end + ); + Py_DECREF(str); - Py_DECREF(substr); + Py_DECREF(sub); + return result; } @@ -3919,20 +3962,10 @@ int tailmatch(PyUnicodeObject *self, Py_ssize_t end, int direction) { - if (start < 0) - start += self->length; - if (start < 0) - start = 0; - if (substring->length == 0) return 1; - if (end > self->length) - end = self->length; - if (end < 0) - end += self->length; - if (end < 0) - end = 0; + FIX_START_END(self); end -= substring->length; if (end < start) @@ -3974,22 +4007,6 @@ Py_ssize_t PyUnicode_Tailmatch(PyObject *str, return result; } -static -const Py_UNICODE *findchar(const Py_UNICODE *s, - Py_ssize_t size, - Py_UNICODE ch) -{ - /* like wcschr, but doesn't stop at NULL characters */ - - while (size-- > 0) { - if (*s == ch) - return s; - s++; - } - - return NULL; -} - /* Apply fixfct filter to the Unicode object self and return a reference to the modified object */ @@ -4148,10 +4165,10 @@ PyUnicode_Join(PyObject *separator, PyObject *seq) PyObject *internal_separator = NULL; const Py_UNICODE blank = ' '; const Py_UNICODE *sep = ␣ - size_t seplen = 1; + Py_ssize_t seplen = 1; PyUnicodeObject *res = NULL; /* the result */ - size_t res_alloc = 100; /* # allocated bytes for string in res */ - size_t res_used; /* # used bytes */ + Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */ + Py_ssize_t res_used; /* # used bytes */ Py_UNICODE *res_p; /* pointer to free byte in res's string area */ PyObject *fseq; /* PySequence_Fast(seq) */ Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ @@ -4212,8 +4229,8 @@ PyUnicode_Join(PyObject *separator, PyObject *seq) res_used = 0; for (i = 0; i < seqlen; ++i) { - size_t itemlen; - size_t new_res_used; + Py_ssize_t itemlen; + Py_ssize_t new_res_used; item = PySequence_Fast_GET_ITEM(fseq, i); /* Convert item to Unicode. */ @@ -4235,19 +4252,18 @@ PyUnicode_Join(PyObject *separator, PyObject *seq) /* Make sure we have enough space for the separator and the item. */ itemlen = PyUnicode_GET_SIZE(item); new_res_used = res_used + itemlen; - if (new_res_used < res_used || new_res_used > PY_SSIZE_T_MAX) + if (new_res_used <= 0) goto Overflow; if (i < seqlen - 1) { new_res_used += seplen; - if (new_res_used < res_used || new_res_used > PY_SSIZE_T_MAX) + if (new_res_used <= 0) goto Overflow; } if (new_res_used > res_alloc) { /* double allocated size until it's big enough */ do { - size_t oldsize = res_alloc; res_alloc += res_alloc; - if (res_alloc < oldsize || res_alloc > PY_SSIZE_T_MAX) + if (res_alloc <= 0) goto Overflow; } while (new_res_used > res_alloc); if (_PyUnicode_Resize(&res, res_alloc) < 0) { @@ -4333,17 +4349,6 @@ PyUnicodeObject *pad(PyUnicodeObject *self, else \ Py_DECREF(str); -#define SPLIT_INSERT(data, left, right) \ - str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ - if (!str) \ - goto onError; \ - if (PyList_Insert(list, 0, str)) { \ - Py_DECREF(str); \ - goto onError; \ - } \ - else \ - Py_DECREF(str); - static PyObject *split_whitespace(PyUnicodeObject *self, PyObject *list, @@ -4404,7 +4409,7 @@ PyObject *PyUnicode_Splitlines(PyObject *string, Py_ssize_t eol; /* Find a line and append it */ - while (i < len && !Py_UNICODE_ISLINEBREAK(data[i])) + while (i < len && !BLOOM_LINEBREAK(data[i])) i++; /* Skip the line break reading CRLF as one line break */ @@ -4515,15 +4520,17 @@ PyObject *rsplit_whitespace(PyUnicodeObject *self, if (j > i) { if (maxcount-- <= 0) break; - SPLIT_INSERT(self->str, i + 1, j + 1); + SPLIT_APPEND(self->str, i + 1, j + 1); while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i])) i--; j = i; } } if (j >= 0) { - SPLIT_INSERT(self->str, 0, j + 1); + SPLIT_APPEND(self->str, 0, j + 1); } + if (PyList_Reverse(list) < 0) + goto onError; return list; onError: @@ -4546,14 +4553,16 @@ PyObject *rsplit_char(PyUnicodeObject *self, if (self->str[i] == ch) { if (maxcount-- <= 0) break; - SPLIT_INSERT(self->str, i + 1, j + 1); + SPLIT_APPEND(self->str, i + 1, j + 1); j = i = i - 1; } else i--; } if (j >= -1) { - SPLIT_INSERT(self->str, 0, j + 1); + SPLIT_APPEND(self->str, 0, j + 1); } + if (PyList_Reverse(list) < 0) + goto onError; return list; onError: @@ -4577,15 +4586,17 @@ PyObject *rsplit_substring(PyUnicodeObject *self, if (Py_UNICODE_MATCH(self, i, substring)) { if (maxcount-- <= 0) break; - SPLIT_INSERT(self->str, i + sublen, j); + SPLIT_APPEND(self->str, i + sublen, j); j = i; i -= sublen; } else i--; } if (j >= 0) { - SPLIT_INSERT(self->str, 0, j); + SPLIT_APPEND(self->str, 0, j); } + if (PyList_Reverse(list) < 0) + goto onError; return list; onError: @@ -4594,7 +4605,6 @@ PyObject *rsplit_substring(PyUnicodeObject *self, } #undef SPLIT_APPEND -#undef SPLIT_INSERT static PyObject *split(PyUnicodeObject *self, @@ -4665,88 +4675,128 @@ PyObject *replace(PyUnicodeObject *self, if (maxcount < 0) maxcount = PY_SSIZE_T_MAX; - if (str1->length == 1 && str2->length == 1) { + if (str1->length == str2->length) { + /* same length */ Py_ssize_t i; - - /* replace characters */ - if (!findchar(self->str, self->length, str1->str[0]) && - PyUnicode_CheckExact(self)) { - /* nothing to replace, return original string */ - Py_INCREF(self); - u = self; + if (str1->length == 1) { + /* replace characters */ + Py_UNICODE u1, u2; + if (!findchar(self->str, self->length, str1->str[0])) + goto nothing; + u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); + if (!u) + return NULL; + Py_UNICODE_COPY(u->str, self->str, self->length); + u1 = str1->str[0]; + u2 = str2->str[0]; + for (i = 0; i < u->length; i++) + if (u->str[i] == u1) { + if (--maxcount < 0) + break; + u->str[i] = u2; + } } else { - Py_UNICODE u1 = str1->str[0]; - Py_UNICODE u2 = str2->str[0]; - - u = (PyUnicodeObject*) PyUnicode_FromUnicode( - NULL, - self->length + i = fastsearch( + self->str, self->length, str1->str, str1->length, FAST_SEARCH ); - if (u != NULL) { - Py_UNICODE_COPY(u->str, self->str, - self->length); - for (i = 0; i < u->length; i++) - if (u->str[i] == u1) { - if (--maxcount < 0) - break; - u->str[i] = u2; - } - } + if (i < 0) + goto nothing; + u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); + if (!u) + return NULL; + Py_UNICODE_COPY(u->str, self->str, self->length); + while (i <= self->length - str1->length) + if (Py_UNICODE_MATCH(self, i, str1)) { + if (--maxcount < 0) + break; + Py_UNICODE_COPY(u->str+i, str2->str, str2->length); + i += str1->length; + } else + i++; } - } else { - Py_ssize_t n, i; + + Py_ssize_t n, i, j, e; + Py_ssize_t product, new_size, delta; Py_UNICODE *p; /* replace strings */ - n = count(self, 0, self->length, str1); + n = stringlib_count(self->str, self->length, str1->str, str1->length); if (n > maxcount) n = maxcount; - if (n == 0) { - /* nothing to replace, return original string */ - if (PyUnicode_CheckExact(self)) { - Py_INCREF(self); - u = self; - } - else { - u = (PyUnicodeObject *) - PyUnicode_FromUnicode(self->str, self->length); - } + if (n == 0) + goto nothing; + /* new_size = self->length + n * (str2->length - str1->length)); */ + delta = (str2->length - str1->length); + if (delta == 0) { + new_size = self->length; } else { - u = _PyUnicode_New( - self->length + n * (str2->length - str1->length)); - if (u) { - i = 0; - p = u->str; - if (str1->length > 0) { - while (i <= self->length - str1->length) - if (Py_UNICODE_MATCH(self, i, str1)) { - /* replace string segment */ - Py_UNICODE_COPY(p, str2->str, str2->length); - p += str2->length; - i += str1->length; - if (--n <= 0) { - /* copy remaining part */ - Py_UNICODE_COPY(p, self->str+i, self->length-i); - break; - } - } else - *p++ = self->str[i++]; - } else { - while (n > 0) { - Py_UNICODE_COPY(p, str2->str, str2->length); - p += str2->length; - if (--n <= 0) - break; - *p++ = self->str[i++]; - } - Py_UNICODE_COPY(p, self->str+i, self->length-i); + product = n * (str2->length - str1->length); + if ((product / (str2->length - str1->length)) != n) { + PyErr_SetString(PyExc_OverflowError, + "replace string is too long"); + return NULL; + } + new_size = self->length + product; + if (new_size < 0) { + PyErr_SetString(PyExc_OverflowError, + "replace string is too long"); + return NULL; + } + } + u = _PyUnicode_New(new_size); + if (!u) + return NULL; + i = 0; + p = u->str; + e = self->length - str1->length; + if (str1->length > 0) { + while (n-- > 0) { + /* look for next match */ + j = i; + while (j <= e) { + if (Py_UNICODE_MATCH(self, j, str1)) + break; + j++; + } + if (j > i) { + if (j > e) + break; + /* copy unchanged part [i:j] */ + Py_UNICODE_COPY(p, self->str+i, j-i); + p += j - i; + } + /* copy substitution string */ + if (str2->length > 0) { + Py_UNICODE_COPY(p, str2->str, str2->length); + p += str2->length; } + i = j + str1->length; + } + if (i < self->length) + /* copy tail [i:] */ + Py_UNICODE_COPY(p, self->str+i, self->length-i); + } else { + /* interleave */ + while (n > 0) { + Py_UNICODE_COPY(p, str2->str, str2->length); + p += str2->length; + if (--n <= 0) + break; + *p++ = self->str[i++]; } + Py_UNICODE_COPY(p, self->str+i, self->length-i); } } - return (PyObject *) u; + +nothing: + /* nothing to replace; return original string (when possible) */ + if (PyUnicode_CheckExact(self)) { + Py_INCREF(self); + return (PyObject *) self; + } + return PyUnicode_FromUnicode(self->str, self->length); } /* --- Unicode Object Methods --------------------------------------------- */ @@ -4983,54 +5033,29 @@ onError: int PyUnicode_Contains(PyObject *container, PyObject *element) { - PyUnicodeObject *u = NULL, *v = NULL; + PyObject *str, *sub; int result; - Py_ssize_t size; - register const Py_UNICODE *lhs, *end, *rhs; /* Coerce the two arguments */ - v = (PyUnicodeObject *)PyUnicode_FromObject(element); - if (v == NULL) { + sub = PyUnicode_FromObject(element); + if (!sub) { PyErr_SetString(PyExc_TypeError, "'in <string>' requires string as left operand"); - goto onError; + return -1; } - u = (PyUnicodeObject *)PyUnicode_FromObject(container); - if (u == NULL) - goto onError; - - size = PyUnicode_GET_SIZE(v); - rhs = PyUnicode_AS_UNICODE(v); - lhs = PyUnicode_AS_UNICODE(u); - result = 0; - if (size == 1) { - end = lhs + PyUnicode_GET_SIZE(u); - while (lhs < end) { - if (*lhs++ == *rhs) { - result = 1; - break; - } - } - } - else { - end = lhs + (PyUnicode_GET_SIZE(u) - size); - while (lhs <= end) { - if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) { - result = 1; - break; - } - } + str = PyUnicode_FromObject(container); + if (!str) { + Py_DECREF(sub); + return -1; } - Py_DECREF(u); - Py_DECREF(v); - return result; + result = stringlib_contains_obj(str, sub); -onError: - Py_XDECREF(u); - Py_XDECREF(v); - return -1; + Py_DECREF(str); + Py_DECREF(sub); + + return result; } /* Concat to string or Unicode object giving a new Unicode object. */ @@ -5078,8 +5103,8 @@ onError: PyDoc_STRVAR(count__doc__, "S.count(sub[, start[, end]]) -> int\n\ \n\ -Return the number of occurrences of substring sub in Unicode string\n\ -S[start:end]. Optional arguments start and end are\n\ +Return the number of non-overlapping occurrences of substring sub in\n\ +Unicode string S[start:end]. Optional arguments start and end are\n\ interpreted as in slice notation."); static PyObject * @@ -5095,24 +5120,19 @@ unicode_count(PyUnicodeObject *self, PyObject *args) return NULL; substring = (PyUnicodeObject *)PyUnicode_FromObject( - (PyObject *)substring); + (PyObject *)substring); if (substring == NULL) return NULL; - if (start < 0) - start += self->length; - if (start < 0) - start = 0; - if (end > self->length) - end = self->length; - if (end < 0) - end += self->length; - if (end < 0) - end = 0; + FIX_START_END(self); - result = PyInt_FromLong((long) count(self, start, end, substring)); + result = PyInt_FromSsize_t( + stringlib_count(self->str + start, end - start, + substring->str, substring->length) + ); Py_DECREF(substring); + return result; } @@ -5262,23 +5282,27 @@ Return -1 on failure."); static PyObject * unicode_find(PyUnicodeObject *self, PyObject *args) { - PyUnicodeObject *substring; + PyObject *substring; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; - PyObject *result; + Py_ssize_t result; if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring, _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) return NULL; - substring = (PyUnicodeObject *)PyUnicode_FromObject( - (PyObject *)substring); - if (substring == NULL) + substring = PyUnicode_FromObject(substring); + if (!substring) return NULL; - result = PyInt_FromSsize_t(findstring(self, substring, start, end, 1)); + result = stringlib_find_slice( + PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), + PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), + start, end + ); Py_DECREF(substring); - return result; + + return PyInt_FromSsize_t(result); } static PyObject * @@ -5328,26 +5352,30 @@ static PyObject * unicode_index(PyUnicodeObject *self, PyObject *args) { Py_ssize_t result; - PyUnicodeObject *substring; + PyObject *substring; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring, _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) return NULL; - - substring = (PyUnicodeObject *)PyUnicode_FromObject( - (PyObject *)substring); - if (substring == NULL) + substring = PyUnicode_FromObject(substring); + if (!substring) return NULL; - result = findstring(self, substring, start, end, 1); + result = stringlib_find_slice( + PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), + PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), + start, end + ); Py_DECREF(substring); + if (result < 0) { PyErr_SetString(PyExc_ValueError, "substring not found"); return NULL; } + return PyInt_FromSsize_t(result); } @@ -5702,16 +5730,6 @@ static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; #define STRIPNAME(i) (stripformat[i]+3) -static const Py_UNICODE * -unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n) -{ - size_t i; - for (i = 0; i < n; ++i) - if (s[i] == c) - return s+i; - return NULL; -} - /* externally visible for str.strip(unicode) */ PyObject * _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) @@ -5722,27 +5740,29 @@ _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); Py_ssize_t i, j; + BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); + i = 0; if (striptype != RIGHTSTRIP) { - while (i < len && unicode_memchr(sep, s[i], seplen)) { - i++; - } + while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { + i++; + } } j = len; if (striptype != LEFTSTRIP) { - do { - j--; - } while (j >= i && unicode_memchr(sep, s[j], seplen)); - j++; + do { + j--; + } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); + j++; } if (i == 0 && j == len && PyUnicode_CheckExact(self)) { - Py_INCREF(self); - return (PyObject*)self; + Py_INCREF(self); + return (PyObject*)self; } else - return PyUnicode_FromUnicode(s+i, j-i); + return PyUnicode_FromUnicode(s+i, j-i); } @@ -5898,9 +5918,19 @@ unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) p = u->str; - while (len-- > 0) { - Py_UNICODE_COPY(p, str->str, str->length); - p += str->length; + if (str->length == 1 && len > 0) { + Py_UNICODE_FILL(p, str->str[0], len); + } else { + Py_ssize_t done = 0; /* number of characters copied this far */ + if (done < nchars) { + Py_UNICODE_COPY(p, str->str, str->length); + done = str->length; + } + while (done < nchars) { + int n = (done <= nchars-done) ? done : nchars-done; + Py_UNICODE_COPY(p+done, p, n); + done += n; + } } return (PyObject*) u; @@ -5993,23 +6023,27 @@ Return -1 on failure."); static PyObject * unicode_rfind(PyUnicodeObject *self, PyObject *args) { - PyUnicodeObject *substring; + PyObject *substring; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; - PyObject *result; + Py_ssize_t result; if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring, _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) return NULL; - substring = (PyUnicodeObject *)PyUnicode_FromObject( - (PyObject *)substring); - if (substring == NULL) + substring = PyUnicode_FromObject(substring); + if (!substring) return NULL; - result = PyInt_FromSsize_t(findstring(self, substring, start, end, -1)); + result = stringlib_rfind_slice( + PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), + PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), + start, end + ); Py_DECREF(substring); - return result; + + return PyInt_FromSsize_t(result); } PyDoc_STRVAR(rindex__doc__, @@ -6020,22 +6054,26 @@ Like S.rfind() but raise ValueError when the substring is not found."); static PyObject * unicode_rindex(PyUnicodeObject *self, PyObject *args) { - Py_ssize_t result; - PyUnicodeObject *substring; + PyObject *substring; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; + Py_ssize_t result; if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring, _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) return NULL; - substring = (PyUnicodeObject *)PyUnicode_FromObject( - (PyObject *)substring); - if (substring == NULL) + substring = PyUnicode_FromObject(substring); + if (!substring) return NULL; - result = findstring(self, substring, start, end, -1); + result = stringlib_rfind_slice( + PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), + PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), + start, end + ); Py_DECREF(substring); + if (result < 0) { PyErr_SetString(PyExc_ValueError, "substring not found"); return NULL; @@ -6137,6 +6175,87 @@ unicode_split(PyUnicodeObject *self, PyObject *args) return PyUnicode_Split((PyObject *)self, substring, maxcount); } +PyObject * +PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) +{ + PyObject* str_obj; + PyObject* sep_obj; + PyObject* out; + + str_obj = PyUnicode_FromObject(str_in); + if (!str_obj) + return NULL; + sep_obj = PyUnicode_FromObject(sep_in); + if (!sep_obj) { + Py_DECREF(str_obj); + return NULL; + } + + out = stringlib_partition( + str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), + sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) + ); + + Py_DECREF(sep_obj); + Py_DECREF(str_obj); + + return out; +} + + +PyObject * +PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) +{ + PyObject* str_obj; + PyObject* sep_obj; + PyObject* out; + + str_obj = PyUnicode_FromObject(str_in); + if (!str_obj) + return NULL; + sep_obj = PyUnicode_FromObject(sep_in); + if (!sep_obj) { + Py_DECREF(str_obj); + return NULL; + } + + out = stringlib_rpartition( + str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), + sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) + ); + + Py_DECREF(sep_obj); + Py_DECREF(str_obj); + + return out; +} + +PyDoc_STRVAR(partition__doc__, +"S.partition(sep) -> (head, sep, tail)\n\ +\n\ +Searches for the separator sep in S, and returns the part before it,\n\ +the separator itself, and the part after it. If the separator is not\n\ +found, returns S and two empty strings."); + +static PyObject* +unicode_partition(PyUnicodeObject *self, PyObject *separator) +{ + return PyUnicode_Partition((PyObject *)self, separator); +} + +PyDoc_STRVAR(rpartition__doc__, +"S.rpartition(sep) -> (head, sep, tail)\n\ +\n\ +Searches for the separator sep in S, starting at the end of S, and returns\n\ +the part before it, the separator itself, and the part after it. If the\n\ +separator is not found, returns S and two empty strings."); + +static PyObject* +unicode_rpartition(PyUnicodeObject *self, PyObject *separator) +{ + return PyUnicode_RPartition((PyObject *)self, separator); +} + PyObject *PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) @@ -6390,6 +6509,7 @@ static PyMethodDef unicode_methods[] = { {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, + {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, @@ -6400,6 +6520,7 @@ static PyMethodDef unicode_methods[] = { {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, + {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, @@ -7375,6 +7496,18 @@ void _PyUnicode_Init(void) { int i; + /* XXX - move this array to unicodectype.c ? */ + Py_UNICODE linebreak[] = { + 0x000A, /* LINE FEED */ + 0x000D, /* CARRIAGE RETURN */ + 0x001C, /* FILE SEPARATOR */ + 0x001D, /* GROUP SEPARATOR */ + 0x001E, /* RECORD SEPARATOR */ + 0x0085, /* NEXT LINE */ + 0x2028, /* LINE SEPARATOR */ + 0x2029, /* PARAGRAPH SEPARATOR */ + }; + /* Init the implementation */ unicode_freelist = NULL; unicode_freelist_size = 0; @@ -7384,6 +7517,11 @@ void _PyUnicode_Init(void) unicode_latin1[i] = NULL; if (PyType_Ready(&PyUnicode_Type) < 0) Py_FatalError("Can't initialize 'unicode'"); + + /* initialize the linebreak bloom filter */ + bloom_linebreak = make_bloom_mask( + linebreak, sizeof(linebreak) / sizeof(linebreak[0]) + ); } /* Finalize the Unicode implementation */ |