diff options
Diffstat (limited to 'Objects/stringlib')
-rw-r--r-- | Objects/stringlib/README.txt | 4 | ||||
-rw-r--r-- | Objects/stringlib/count.h | 16 | ||||
-rw-r--r-- | Objects/stringlib/ctype.h | 1 | ||||
-rw-r--r-- | Objects/stringlib/fastsearch.h | 148 | ||||
-rw-r--r-- | Objects/stringlib/find.h | 86 | ||||
-rw-r--r-- | Objects/stringlib/formatter.h | 92 | ||||
-rw-r--r-- | Objects/stringlib/partition.h | 47 | ||||
-rw-r--r-- | Objects/stringlib/split.h | 394 | ||||
-rw-r--r-- | Objects/stringlib/string_format.h | 26 | ||||
-rw-r--r-- | Objects/stringlib/stringdefs.h | 4 | ||||
-rw-r--r-- | Objects/stringlib/transmogrify.h | 113 | ||||
-rw-r--r-- | Objects/stringlib/unicodedefs.h | 22 |
12 files changed, 636 insertions, 317 deletions
diff --git a/Objects/stringlib/README.txt b/Objects/stringlib/README.txt index 60d919e..ab506d6 100644 --- a/Objects/stringlib/README.txt +++ b/Objects/stringlib/README.txt @@ -16,10 +16,6 @@ STRINGLIB_EMPTY a PyObject representing the empty string, only to be used if STRINGLIB_MUTABLE is 0 -int STRINGLIB_CMP(STRINGLIB_CHAR*, STRINGLIB_CHAR*, Py_ssize_t) - - compares two strings. returns 0 if they match, and non-zero if not. - Py_ssize_t STRINGLIB_LEN(PyObject*) returns the length of the given string object (which must be of the diff --git a/Objects/stringlib/count.h b/Objects/stringlib/count.h index eba37e9..de34f96 100644 --- a/Objects/stringlib/count.h +++ b/Objects/stringlib/count.h @@ -9,28 +9,22 @@ Py_LOCAL_INLINE(Py_ssize_t) stringlib_count(const STRINGLIB_CHAR* str, Py_ssize_t str_len, - const STRINGLIB_CHAR* sub, Py_ssize_t sub_len) + const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, + Py_ssize_t maxcount) { Py_ssize_t count; if (str_len < 0) return 0; /* start > len(str) */ if (sub_len == 0) - return str_len + 1; + return (str_len < maxcount) ? str_len + 1 : maxcount; - count = fastsearch(str, str_len, sub, sub_len, FAST_COUNT); + count = fastsearch(str, str_len, sub, sub_len, maxcount, FAST_COUNT); if (count < 0) - count = 0; /* no match */ + return 0; /* no match */ return count; } #endif - -/* -Local variables: -c-basic-offset: 4 -indent-tabs-mode: nil -End: -*/ diff --git a/Objects/stringlib/ctype.h b/Objects/stringlib/ctype.h index 8951276..739cf3d 100644 --- a/Objects/stringlib/ctype.h +++ b/Objects/stringlib/ctype.h @@ -107,4 +107,3 @@ stringlib_swapcase(PyObject *self) STRINGLIB_LEN(self)); return newobj; } - diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 23bccfb..e231c58 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -5,7 +5,7 @@ /* fast search/count implementation, based on a mix between boyer- moore and horspool, with a few more bells and whistles on the top. - for some more background, see: http://effbot.org/stringlib.htm */ + for some more background, see: http://effbot.org/zone/stringlib.htm */ /* note: fastsearch may access s[n], which isn't a problem when using Python's ordinary string types, but may cause problems if you're @@ -16,19 +16,35 @@ #define FAST_COUNT 0 #define FAST_SEARCH 1 +#define FAST_RSEARCH 2 + +#if LONG_BIT >= 128 +#define STRINGLIB_BLOOM_WIDTH 128 +#elif LONG_BIT >= 64 +#define STRINGLIB_BLOOM_WIDTH 64 +#elif LONG_BIT >= 32 +#define STRINGLIB_BLOOM_WIDTH 32 +#else +#error "LONG_BIT is smaller than 32" +#endif + +#define STRINGLIB_BLOOM_ADD(mask, ch) \ + ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) +#define STRINGLIB_BLOOM(mask, ch) \ + ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) Py_LOCAL_INLINE(Py_ssize_t) fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, const STRINGLIB_CHAR* p, Py_ssize_t m, - int mode) + Py_ssize_t maxcount, int mode) { - long mask; + unsigned long mask; Py_ssize_t skip, count = 0; Py_ssize_t i, j, mlast, w; w = n - m; - if (w < 0) + if (w < 0 || (mode == FAST_COUNT && maxcount == 0)) return -1; /* look for special cases */ @@ -38,54 +54,101 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, /* use special case for 1-character strings */ if (mode == FAST_COUNT) { for (i = 0; i < n; i++) - if (s[i] == p[0]) + if (s[i] == p[0]) { count++; + if (count == maxcount) + return maxcount; + } return count; - } else { + } else if (mode == FAST_SEARCH) { for (i = 0; i < n; i++) if (s[i] == p[0]) return i; + } else { /* FAST_RSEARCH */ + for (i = n - 1; i > -1; i--) + if (s[i] == p[0]) + return i; } return -1; } mlast = m - 1; - - /* create compressed boyer-moore delta 1 table */ skip = mlast - 1; - /* process pattern[:-1] */ - for (mask = i = 0; i < mlast; i++) { - mask |= (1 << (p[i] & 0x1F)); - if (p[i] == p[mlast]) - skip = mlast - i - 1; - } - /* process pattern[-1] outside the loop */ - mask |= (1 << (p[mlast] & 0x1F)); - - for (i = 0; i <= w; i++) { - /* note: using mlast in the skip path slows things down on x86 */ - if (s[i+m-1] == p[m-1]) { - /* candidate match */ - for (j = 0; j < mlast; j++) - if (s[i+j] != p[j]) - break; - if (j == mlast) { - /* got a match! */ - if (mode != FAST_COUNT) + mask = 0; + + if (mode != FAST_RSEARCH) { + + /* create compressed boyer-moore delta 1 table */ + + /* process pattern[:-1] */ + for (i = 0; i < mlast; i++) { + STRINGLIB_BLOOM_ADD(mask, p[i]); + if (p[i] == p[mlast]) + skip = mlast - i - 1; + } + /* process pattern[-1] outside the loop */ + STRINGLIB_BLOOM_ADD(mask, p[mlast]); + + for (i = 0; i <= w; i++) { + /* note: using mlast in the skip path slows things down on x86 */ + if (s[i+m-1] == p[m-1]) { + /* candidate match */ + for (j = 0; j < mlast; j++) + if (s[i+j] != p[j]) + break; + if (j == mlast) { + /* got a match! */ + if (mode != FAST_COUNT) + return i; + count++; + if (count == maxcount) + return maxcount; + i = i + mlast; + continue; + } + /* miss: check if next character is part of pattern */ + if (!STRINGLIB_BLOOM(mask, s[i+m])) + i = i + m; + else + i = i + skip; + } else { + /* skip: check if next character is part of pattern */ + if (!STRINGLIB_BLOOM(mask, s[i+m])) + i = i + m; + } + } + } else { /* FAST_RSEARCH */ + + /* create compressed boyer-moore delta 1 table */ + + /* process pattern[0] outside the loop */ + STRINGLIB_BLOOM_ADD(mask, p[0]); + /* process pattern[:0:-1] */ + for (i = mlast; i > 0; i--) { + STRINGLIB_BLOOM_ADD(mask, p[i]); + if (p[i] == p[0]) + skip = i - 1; + } + + for (i = w; i >= 0; i--) { + if (s[i] == p[0]) { + /* candidate match */ + for (j = mlast; j > 0; j--) + if (s[i+j] != p[j]) + break; + if (j == 0) + /* got a match! */ return i; - count++; - i = i + mlast; - continue; + /* miss: check if previous character is part of pattern */ + if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) + i = i - m; + else + i = i - skip; + } else { + /* skip: check if previous character is part of pattern */ + if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) + i = i - m; } - /* miss: check if next character is part of pattern */ - if (!(mask & (1 << (s[i+m] & 0x1F)))) - i = i + m; - else - i = i + skip; - } else { - /* skip: check if next character is part of pattern */ - if (!(mask & (1 << (s[i+m] & 0x1F)))) - i = i + m; } } @@ -95,10 +158,3 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, } #endif - -/* -Local variables: -c-basic-offset: 4 -indent-tabs-mode: nil -End: -*/ diff --git a/Objects/stringlib/find.h b/Objects/stringlib/find.h index 4407d71..ce615dc 100644 --- a/Objects/stringlib/find.h +++ b/Objects/stringlib/find.h @@ -19,7 +19,7 @@ stringlib_find(const STRINGLIB_CHAR* str, Py_ssize_t str_len, if (sub_len == 0) return offset; - pos = fastsearch(str, str_len, sub, sub_len, FAST_SEARCH); + pos = fastsearch(str, str_len, sub, sub_len, -1, FAST_SEARCH); if (pos >= 0) pos += offset; @@ -32,42 +32,43 @@ stringlib_rfind(const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, Py_ssize_t offset) { - /* XXX - create reversefastsearch helper! */ - if (sub_len == 0) { - if (str_len < 0) - return -1; - return str_len + offset; - } else { - Py_ssize_t j, pos = -1; - for (j = str_len - sub_len; j >= 0; --j) - if (STRINGLIB_CMP(str+j, sub, sub_len) == 0) { - pos = j + offset; - break; - } - return pos; - } + Py_ssize_t pos; + + if (str_len < 0) + return -1; + if (sub_len == 0) + return str_len + offset; + + pos = fastsearch(str, str_len, sub, sub_len, -1, FAST_RSEARCH); + + if (pos >= 0) + pos += offset; + + return pos; } +/* helper macro to fixup start/end slice values */ +#define ADJUST_INDICES(start, end, len) \ + if (end > len) \ + end = len; \ + else if (end < 0) { \ + end += len; \ + if (end < 0) \ + end = 0; \ + } \ + if (start < 0) { \ + start += len; \ + if (start < 0) \ + start = 0; \ + } + Py_LOCAL_INLINE(Py_ssize_t) stringlib_find_slice(const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, Py_ssize_t start, Py_ssize_t end) { - if (start < 0) - start += str_len; - if (start < 0) - start = 0; - if (end > str_len) - end = str_len; - if (end < 0) - end += str_len; - if (end < 0) - end = 0; - - return stringlib_find( - str + start, end - start, - sub, sub_len, start - ); + ADJUST_INDICES(start, end, str_len); + return stringlib_find(str + start, end - start, sub, sub_len, start); } Py_LOCAL_INLINE(Py_ssize_t) @@ -75,17 +76,7 @@ stringlib_rfind_slice(const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, Py_ssize_t start, Py_ssize_t end) { - if (start < 0) - start += str_len; - if (start < 0) - start = 0; - if (end > str_len) - end = str_len; - if (end < 0) - end += str_len; - if (end < 0) - end = 0; - + ADJUST_INDICES(start, end, str_len); return stringlib_rfind(str + start, end - start, sub, sub_len, start); } @@ -100,7 +91,7 @@ stringlib_contains_obj(PyObject* str, PyObject* sub) ) != -1; } -#endif /* STRINGLIB_STR */ +#endif /* STRINGLIB_WANT_CONTAINS_OBJ */ /* This function is a helper for the "find" family (find, rfind, index, @@ -149,7 +140,7 @@ stringlib_parse_args_finds(const char * function_name, PyObject *args, #undef FORMAT_BUFFER_SIZE -#ifdef FROM_UNICODE +#if STRINGLIB_IS_UNICODE /* Wraps stringlib_parse_args_finds() and additionally ensures that the @@ -179,13 +170,6 @@ stringlib_parse_args_finds_unicode(const char * function_name, PyObject *args, return 0; } -#endif /* FROM_UNICODE */ +#endif /* STRINGLIB_IS_UNICODE */ #endif /* STRINGLIB_FIND_H */ - -/* -Local variables: -c-basic-offset: 4 -indent-tabs-mode: nil -End: -*/ diff --git a/Objects/stringlib/formatter.h b/Objects/stringlib/formatter.h index 4770863..4fdc62d 100644 --- a/Objects/stringlib/formatter.h +++ b/Objects/stringlib/formatter.h @@ -32,7 +32,7 @@ unknown_presentation_type(STRINGLIB_CHAR presentation_type, PyErr_Format(PyExc_ValueError, "Unknown format code '%c' " "for object of type '%.200s'", - presentation_type, + (char)presentation_type, type_name); #if STRINGLIB_IS_UNICODE else @@ -44,6 +44,24 @@ unknown_presentation_type(STRINGLIB_CHAR presentation_type, #endif } +static void +invalid_comma_type(STRINGLIB_CHAR presentation_type) +{ +#if STRINGLIB_IS_UNICODE + /* See comment in unknown_presentation_type */ + if (presentation_type > 32 && presentation_type < 128) +#endif + PyErr_Format(PyExc_ValueError, + "Cannot specify ',' with '%c'.", + (char)presentation_type); +#if STRINGLIB_IS_UNICODE + else + PyErr_Format(PyExc_ValueError, + "Cannot specify ',' with '\\x%x'.", + (unsigned int)presentation_type); +#endif +} + /* get_integer consumes 0 or more decimal digit characters from an input string, updates *result with the corresponding positive @@ -277,8 +295,7 @@ parse_internal_render_format_spec(STRINGLIB_CHAR *format_spec, /* These are allowed. See PEP 378.*/ break; default: - PyErr_Format(PyExc_ValueError, - "Cannot specify ',' with '%c'.", format->type); + invalid_comma_type(format->type); return 0; } } @@ -632,8 +649,8 @@ get_locale_info(int type, LocaleInfo *locale_info) case LT_DEFAULT_LOCALE: locale_info->decimal_point = "."; locale_info->thousands_sep = ","; - locale_info->grouping = "\3"; /* Group every 3 characters, - trailing 0 means repeat + locale_info->grouping = "\3"; /* Group every 3 characters. The + (implicit) trailing 0 means repeat infinitely. */ break; case LT_NO_LOCALE: @@ -759,14 +776,6 @@ format_int_or_long_internal(PyObject *value, const InternalFormatSpec *format, goto done; } - /* Error to specify a comma. */ - if (format->thousands_separators) { - PyErr_SetString(PyExc_ValueError, - "Thousands separators not allowed with integer" - " format specifier 'c'"); - goto done; - } - /* taken from unicodeobject.c formatchar() */ /* Integer input truncated to a character */ /* XXX: won't work for int */ @@ -932,20 +941,16 @@ format_float_internal(PyObject *value, from a hard-code pseudo-locale */ LocaleInfo locale; - /* Alternate is not allowed on floats. */ - if (format->alternate) { - PyErr_SetString(PyExc_ValueError, - "Alternate form (#) not allowed in float format " - "specifier"); - goto done; - } + if (format->alternate) + flags |= Py_DTSF_ALT; if (type == '\0') { - /* Omitted type specifier. This is like 'g' but with at least one - digit after the decimal point, and different default precision.*/ - type = 'g'; - default_precision = PyFloat_STR_PRECISION; + /* Omitted type specifier. Behaves in the same way as repr(x) + and str(x) if no precision is given, else like 'g', but with + at least one digit after the decimal point. */ flags |= Py_DTSF_ADD_DOT_0; + type = 'r'; + default_precision = 0; } if (type == 'n') @@ -953,13 +958,6 @@ format_float_internal(PyObject *value, format the result. We take care of that later. */ type = 'g'; -#if PY_VERSION_HEX < 0x0301000 - /* 'F' is the same as 'f', per the PEP */ - /* This is no longer the case in 3.x */ - if (type == 'F') - type = 'f'; -#endif - val = PyFloat_AsDouble(value); if (val == -1.0 && PyErr_Occurred()) goto done; @@ -972,12 +970,8 @@ format_float_internal(PyObject *value, if (precision < 0) precision = default_precision; - -#if PY_VERSION_HEX < 0x03010000 - /* 3.1 no longer converts large 'f' to 'g'. */ - if ((type == 'f' || type == 'F') && fabs(val) >= 1e50) + else if (type == 'r') type = 'g'; -#endif /* Cast "type", because if we're in unicode we need to pass a 8-bit char. This is safe, because we've restricted what "type" @@ -1105,15 +1099,7 @@ format_complex_internal(PyObject *value, from a hard-code pseudo-locale */ LocaleInfo locale; - /* Alternate is not allowed on complex. */ - if (format->alternate) { - PyErr_SetString(PyExc_ValueError, - "Alternate form (#) not allowed in complex format " - "specifier"); - goto done; - } - - /* Neither is zero pading. */ + /* Zero padding is not allowed. */ if (format->fill_char == '0') { PyErr_SetString(PyExc_ValueError, "Zero padding is not allowed in complex format " @@ -1136,10 +1122,13 @@ format_complex_internal(PyObject *value, if (im == -1.0 && PyErr_Occurred()) goto done; + if (format->alternate) + flags |= Py_DTSF_ALT; + if (type == '\0') { /* Omitted type specifier. Should be like str(self). */ - type = 'g'; - default_precision = PyFloat_STR_PRECISION; + type = 'r'; + default_precision = 0; if (re == 0.0 && copysign(1.0, re) == 1.0) skip_re = 1; else @@ -1151,15 +1140,10 @@ format_complex_internal(PyObject *value, format the result. We take care of that later. */ type = 'g'; -#if PY_VERSION_HEX < 0x03010000 - /* This is no longer the case in 3.x */ - /* 'F' is the same as 'f', per the PEP */ - if (type == 'F') - type = 'f'; -#endif - if (precision < 0) precision = default_precision; + else if (type == 'r') + type = 'g'; /* Cast "type", because if we're in unicode we need to pass a 8-bit char. This is safe, because we've restricted what "type" diff --git a/Objects/stringlib/partition.h b/Objects/stringlib/partition.h index 20c7507..0170bdd 100644 --- a/Objects/stringlib/partition.h +++ b/Objects/stringlib/partition.h @@ -8,10 +8,10 @@ #endif Py_LOCAL_INLINE(PyObject*) -stringlib_partition( - PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, - PyObject* sep_obj, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len - ) +stringlib_partition(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + PyObject* sep_obj, + const STRINGLIB_CHAR* sep, Py_ssize_t sep_len) { PyObject* out; Py_ssize_t pos; @@ -25,15 +25,21 @@ stringlib_partition( if (!out) return NULL; - pos = fastsearch(str, str_len, sep, sep_len, FAST_SEARCH); + pos = fastsearch(str, str_len, sep, sep_len, -1, FAST_SEARCH); if (pos < 0) { +#if STRINGLIB_MUTABLE + PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, str_len)); + PyTuple_SET_ITEM(out, 1, STRINGLIB_NEW(NULL, 0)); + PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(NULL, 0)); +#else Py_INCREF(str_obj); PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj); Py_INCREF(STRINGLIB_EMPTY); PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY); Py_INCREF(STRINGLIB_EMPTY); PyTuple_SET_ITEM(out, 2, (PyObject*) STRINGLIB_EMPTY); +#endif return out; } @@ -52,13 +58,13 @@ stringlib_partition( } Py_LOCAL_INLINE(PyObject*) -stringlib_rpartition( - PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, - PyObject* sep_obj, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len - ) +stringlib_rpartition(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + PyObject* sep_obj, + const STRINGLIB_CHAR* sep, Py_ssize_t sep_len) { PyObject* out; - Py_ssize_t pos, j; + Py_ssize_t pos; if (sep_len == 0) { PyErr_SetString(PyExc_ValueError, "empty separator"); @@ -69,21 +75,21 @@ stringlib_rpartition( if (!out) return NULL; - /* XXX - create reversefastsearch helper! */ - pos = -1; - for (j = str_len - sep_len; j >= 0; --j) - if (STRINGLIB_CMP(str+j, sep, sep_len) == 0) { - pos = j; - break; - } + pos = fastsearch(str, str_len, sep, sep_len, -1, FAST_RSEARCH); if (pos < 0) { +#if STRINGLIB_MUTABLE + PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(NULL, 0)); + PyTuple_SET_ITEM(out, 1, STRINGLIB_NEW(NULL, 0)); + PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str, str_len)); +#else Py_INCREF(STRINGLIB_EMPTY); PyTuple_SET_ITEM(out, 0, (PyObject*) STRINGLIB_EMPTY); Py_INCREF(STRINGLIB_EMPTY); PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY); Py_INCREF(str_obj); PyTuple_SET_ITEM(out, 2, (PyObject*) str_obj); +#endif return out; } @@ -102,10 +108,3 @@ stringlib_rpartition( } #endif - -/* -Local variables: -c-basic-offset: 4 -indent-tabs-mode: nil -End: -*/ diff --git a/Objects/stringlib/split.h b/Objects/stringlib/split.h new file mode 100644 index 0000000..60e7767 --- /dev/null +++ b/Objects/stringlib/split.h @@ -0,0 +1,394 @@ +/* stringlib: split implementation */ + +#ifndef STRINGLIB_SPLIT_H +#define STRINGLIB_SPLIT_H + +#ifndef STRINGLIB_FASTSEARCH_H +#error must include "stringlib/fastsearch.h" before including this module +#endif + +/* Overallocate the initial list to reduce the number of reallocs for small + split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three + resizes, to sizes 4, 8, then 16. Most observed string splits are for human + text (roughly 11 words per line) and field delimited data (usually 1-10 + fields). For large strings the split algorithms are bandwidth limited + so increasing the preallocation likely will not improve things.*/ + +#define MAX_PREALLOC 12 + +/* 5 splits gives 6 elements */ +#define PREALLOC_SIZE(maxsplit) \ + (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1) + +#define SPLIT_APPEND(data, left, right) \ + sub = STRINGLIB_NEW((data) + (left), \ + (right) - (left)); \ + if (sub == NULL) \ + goto onError; \ + if (PyList_Append(list, sub)) { \ + Py_DECREF(sub); \ + goto onError; \ + } \ + else \ + Py_DECREF(sub); + +#define SPLIT_ADD(data, left, right) { \ + sub = STRINGLIB_NEW((data) + (left), \ + (right) - (left)); \ + if (sub == NULL) \ + goto onError; \ + if (count < MAX_PREALLOC) { \ + PyList_SET_ITEM(list, count, sub); \ + } else { \ + if (PyList_Append(list, sub)) { \ + Py_DECREF(sub); \ + goto onError; \ + } \ + else \ + Py_DECREF(sub); \ + } \ + count++; } + + +/* Always force the list to the expected size. */ +#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count + +Py_LOCAL_INLINE(PyObject *) +stringlib_split_whitespace(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = 0; + while (maxcount-- > 0) { + while (i < str_len && STRINGLIB_ISSPACE(str[i])) + i++; + if (i == str_len) break; + j = i; i++; + while (i < str_len && !STRINGLIB_ISSPACE(str[i])) + i++; +#ifndef STRINGLIB_MUTABLE + if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No whitespace in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + break; + } +#endif + SPLIT_ADD(str, j, i); + } + + if (i < str_len) { + /* Only occurs when maxcount was reached */ + /* Skip any remaining whitespace and copy to end of string */ + while (i < str_len && STRINGLIB_ISSPACE(str[i])) + i++; + if (i != str_len) + SPLIT_ADD(str, i, str_len); + } + FIX_PREALLOC_SIZE(list); + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_split_char(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR ch, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = 0; + while ((j < str_len) && (maxcount-- > 0)) { + for(; j < str_len; j++) { + /* I found that using memchr makes no difference */ + if (str[j] == ch) { + SPLIT_ADD(str, i, j); + i = j = j + 1; + break; + } + } + } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* ch not in str_obj, so just use str_obj as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + if (i <= str_len) { + SPLIT_ADD(str, i, str_len); + } + FIX_PREALLOC_SIZE(list); + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_split(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, pos, count=0; + PyObject *list, *sub; + + if (sep_len == 0) { + PyErr_SetString(PyExc_ValueError, "empty separator"); + return NULL; + } + else if (sep_len == 1) + return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount); + + list = PyList_New(PREALLOC_SIZE(maxcount)); + if (list == NULL) + return NULL; + + i = j = 0; + while (maxcount-- > 0) { + pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); + if (pos < 0) + break; + j = i + pos; + SPLIT_ADD(str, i, j); + i = j + sep_len; + } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No match in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + { + SPLIT_ADD(str, i, str_len); + } + FIX_PREALLOC_SIZE(list); + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_rsplit_whitespace(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = str_len - 1; + while (maxcount-- > 0) { + while (i >= 0 && STRINGLIB_ISSPACE(str[i])) + i--; + if (i < 0) break; + j = i; i--; + while (i >= 0 && !STRINGLIB_ISSPACE(str[i])) + i--; +#ifndef STRINGLIB_MUTABLE + if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No whitespace in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + break; + } +#endif + SPLIT_ADD(str, i + 1, j + 1); + } + + if (i >= 0) { + /* Only occurs when maxcount was reached */ + /* Skip any remaining whitespace and copy to beginning of string */ + while (i >= 0 && STRINGLIB_ISSPACE(str[i])) + i--; + if (i >= 0) + SPLIT_ADD(str, 0, i + 1); + } + FIX_PREALLOC_SIZE(list); + if (PyList_Reverse(list) < 0) + goto onError; + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_rsplit_char(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR ch, + Py_ssize_t maxcount) +{ + Py_ssize_t i, j, count=0; + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); + PyObject *sub; + + if (list == NULL) + return NULL; + + i = j = str_len - 1; + while ((i >= 0) && (maxcount-- > 0)) { + for(; i >= 0; i--) { + if (str[i] == ch) { + SPLIT_ADD(str, i + 1, j + 1); + j = i = i - 1; + break; + } + } + } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* ch not in str_obj, so just use str_obj as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + if (j >= -1) { + SPLIT_ADD(str, 0, j + 1); + } + FIX_PREALLOC_SIZE(list); + if (PyList_Reverse(list) < 0) + goto onError; + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_rsplit(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, + Py_ssize_t maxcount) +{ + Py_ssize_t j, pos, count=0; + PyObject *list, *sub; + + if (sep_len == 0) { + PyErr_SetString(PyExc_ValueError, "empty separator"); + return NULL; + } + else if (sep_len == 1) + return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount); + + list = PyList_New(PREALLOC_SIZE(maxcount)); + if (list == NULL) + return NULL; + + j = str_len; + while (maxcount-- > 0) { + pos = fastsearch(str, j, sep, sep_len, -1, FAST_RSEARCH); + if (pos < 0) + break; + SPLIT_ADD(str, pos + sep_len, j); + j = pos; + } +#ifndef STRINGLIB_MUTABLE + if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No match in str_obj, so just use it as list[0] */ + Py_INCREF(str_obj); + PyList_SET_ITEM(list, 0, (PyObject *)str_obj); + count++; + } else +#endif + { + SPLIT_ADD(str, 0, j); + } + FIX_PREALLOC_SIZE(list); + if (PyList_Reverse(list) < 0) + goto onError; + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +Py_LOCAL_INLINE(PyObject *) +stringlib_splitlines(PyObject* str_obj, + const STRINGLIB_CHAR* str, Py_ssize_t str_len, + int keepends) +{ + /* This does not use the preallocated list because splitlines is + usually run with hundreds of newlines. The overhead of + switching between PyList_SET_ITEM and append causes about a + 2-3% slowdown for that common case. A smarter implementation + could move the if check out, so the SET_ITEMs are done first + and the appends only done when the prealloc buffer is full. + That's too much work for little gain.*/ + + register Py_ssize_t i; + register Py_ssize_t j; + PyObject *list = PyList_New(0); + PyObject *sub; + + if (list == NULL) + return NULL; + + for (i = j = 0; i < str_len; ) { + Py_ssize_t eol; + + /* Find a line and append it */ + while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i])) + i++; + + /* Skip the line break reading CRLF as one line break */ + eol = i; + if (i < str_len) { + if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n') + i += 2; + else + i++; + if (keepends) + eol = i; + } +#ifndef STRINGLIB_MUTABLE + if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) { + /* No linebreak in str_obj, so just use it as list[0] */ + if (PyList_Append(list, str_obj)) + goto onError; + break; + } +#endif + SPLIT_APPEND(str, j, eol); + j = i; + } + return list; + + onError: + Py_DECREF(list); + return NULL; +} + +#endif diff --git a/Objects/stringlib/string_format.h b/Objects/stringlib/string_format.h index b2095fd..6f10727 100644 --- a/Objects/stringlib/string_format.h +++ b/Objects/stringlib/string_format.h @@ -499,13 +499,16 @@ get_field_object(SubString *input, PyObject *args, PyObject *kwargs, PyObject *key = SubString_new_object(&first); if (key == NULL) goto error; - if ((kwargs == NULL) || (obj = PyDict_GetItem(kwargs, key)) == NULL) { + + /* Use PyObject_GetItem instead of PyDict_GetItem because this + code is no longer just used with kwargs. It might be passed + a non-dict when called through format_map. */ + if ((kwargs == NULL) || (obj = PyObject_GetItem(kwargs, key)) == NULL) { PyErr_SetObject(PyExc_KeyError, key); Py_DECREF(key); goto error; } Py_DECREF(key); - Py_INCREF(obj); } else { /* look up in args */ @@ -1039,6 +1042,11 @@ do_string_format(PyObject *self, PyObject *args, PyObject *kwargs) return build_string(&input, args, kwargs, recursion_depth, &auto_number); } +static PyObject * +do_string_format_map(PyObject *self, PyObject *obj) +{ + return do_string_format(self, NULL, obj); +} /************************************************************************/ @@ -1180,10 +1188,15 @@ static PyTypeObject PyFormatterIter_Type = { describing the parsed elements. It's a wrapper around stringlib/string_format.h's MarkupIterator */ static PyObject * -formatter_parser(STRINGLIB_OBJECT *self) +formatter_parser(PyObject *ignored, STRINGLIB_OBJECT *self) { formatteriterobject *it; + if (!PyUnicode_Check(self)) { + PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name); + return NULL; + } + it = PyObject_New(formatteriterobject, &PyFormatterIter_Type); if (it == NULL) return NULL; @@ -1315,7 +1328,7 @@ static PyTypeObject PyFieldNameIter_Type = { field_name_split. The iterator it returns is a FieldNameIterator */ static PyObject * -formatter_field_name_split(STRINGLIB_OBJECT *self) +formatter_field_name_split(PyObject *ignored, STRINGLIB_OBJECT *self) { SubString first; Py_ssize_t first_idx; @@ -1324,6 +1337,11 @@ formatter_field_name_split(STRINGLIB_OBJECT *self) PyObject *first_obj = NULL; PyObject *result = NULL; + if (!PyUnicode_Check(self)) { + PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name); + return NULL; + } + it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type); if (it == NULL) return NULL; diff --git a/Objects/stringlib/stringdefs.h b/Objects/stringlib/stringdefs.h index a5672c7..1c49426 100644 --- a/Objects/stringlib/stringdefs.h +++ b/Objects/stringlib/stringdefs.h @@ -11,6 +11,8 @@ #define STRINGLIB_TYPE_NAME "string" #define STRINGLIB_PARSE_CODE "S" #define STRINGLIB_EMPTY nullstring +#define STRINGLIB_ISSPACE Py_ISSPACE +#define STRINGLIB_ISLINEBREAK(x) ((x == '\n') || (x == '\r')) #define STRINGLIB_ISDECIMAL(x) ((x >= '0') && (x <= '9')) #define STRINGLIB_TODECIMAL(x) (STRINGLIB_ISDECIMAL(x) ? (x - '0') : -1) #define STRINGLIB_TOUPPER Py_TOUPPER @@ -21,7 +23,7 @@ #define STRINGLIB_NEW PyBytes_FromStringAndSize #define STRINGLIB_RESIZE _PyBytes_Resize #define STRINGLIB_CHECK PyBytes_Check -#define STRINGLIB_CMP memcmp +#define STRINGLIB_CHECK_EXACT PyBytes_CheckExact #define STRINGLIB_TOSTR PyObject_Str #define STRINGLIB_GROUPING _PyBytes_InsertThousandsGrouping #define STRINGLIB_GROUPING_LOCALE _PyBytes_InsertThousandsGroupingLocale diff --git a/Objects/stringlib/transmogrify.h b/Objects/stringlib/transmogrify.h index 4390e22..1e132e5 100644 --- a/Objects/stringlib/transmogrify.h +++ b/Objects/stringlib/transmogrify.h @@ -1,13 +1,6 @@ /* NOTE: this API is -ONLY- for use with single byte character strings. */ /* Do not use it with Unicode. */ -#include "bytes_methods.h" - -#ifndef STRINGLIB_MUTABLE -#warning "STRINGLIB_MUTABLE not defined before #include, assuming 0" -#define STRINGLIB_MUTABLE 0 -#endif - /* the more complicated methods. parts of these should be pulled out into the shared code in bytes_methods.c to cut down on duplicate code bloat. */ @@ -25,10 +18,10 @@ stringlib_expandtabs(PyObject *self, PyObject *args) size_t i, j; PyObject *u; int tabsize = 8; - + if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) return NULL; - + /* First pass: determine size of output string */ i = j = 0; e = STRINGLIB_STR(self) + STRINGLIB_LEN(self); @@ -55,20 +48,20 @@ stringlib_expandtabs(PyObject *self, PyObject *args) } } } - + if ((i + j) > PY_SSIZE_T_MAX) { PyErr_SetString(PyExc_OverflowError, "result is too long"); return NULL; } - + /* Second pass: create output string and fill it */ u = STRINGLIB_NEW(NULL, i + j); if (!u) return NULL; - + j = 0; q = STRINGLIB_STR(u); - + for (p = STRINGLIB_STR(self); p < e; p++) if (*p == '\t') { if (tabsize > 0) { @@ -84,7 +77,7 @@ stringlib_expandtabs(PyObject *self, PyObject *args) if (*p == '\n' || *p == '\r') j = 0; } - + return u; } @@ -110,16 +103,16 @@ pad(PyObject *self, Py_ssize_t left, Py_ssize_t right, char fill) } u = STRINGLIB_NEW(NULL, - left + STRINGLIB_LEN(self) + right); + left + STRINGLIB_LEN(self) + right); if (u) { if (left) memset(STRINGLIB_STR(u), fill, left); Py_MEMCPY(STRINGLIB_STR(u) + left, - STRINGLIB_STR(self), - STRINGLIB_LEN(self)); + STRINGLIB_STR(self), + STRINGLIB_LEN(self)); if (right) memset(STRINGLIB_STR(u) + left + STRINGLIB_LEN(self), - fill, right); + fill, right); } return u; @@ -269,87 +262,3 @@ stringlib_zfill(PyObject *self, PyObject *args) return (PyObject*) s; } - - -#define _STRINGLIB_SPLIT_APPEND(data, left, right) \ - str = STRINGLIB_NEW((data) + (left), \ - (right) - (left)); \ - if (str == NULL) \ - goto onError; \ - if (PyList_Append(list, str)) { \ - Py_DECREF(str); \ - goto onError; \ - } \ - else \ - Py_DECREF(str); - -PyDoc_STRVAR(splitlines__doc__, -"B.splitlines([keepends]) -> list of lines\n\ -\n\ -Return a list of the lines in B, breaking at line boundaries.\n\ -Line breaks are not included in the resulting list unless keepends\n\ -is given and true."); - -static PyObject* -stringlib_splitlines(PyObject *self, PyObject *args) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len; - int keepends = 0; - PyObject *list; - PyObject *str; - char *data; - - if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) - return NULL; - - data = STRINGLIB_STR(self); - len = STRINGLIB_LEN(self); - - /* This does not use the preallocated list because splitlines is - usually run with hundreds of newlines. The overhead of - switching between PyList_SET_ITEM and append causes about a - 2-3% slowdown for that common case. A smarter implementation - could move the if check out, so the SET_ITEMs are done first - and the appends only done when the prealloc buffer is full. - That's too much work for little gain.*/ - - list = PyList_New(0); - if (!list) - goto onError; - - for (i = j = 0; i < len; ) { - Py_ssize_t eol; - - /* Find a line and append it */ - while (i < len && data[i] != '\n' && data[i] != '\r') - i++; - - /* Skip the line break reading CRLF as one line break */ - eol = i; - if (i < len) { - if (data[i] == '\r' && i + 1 < len && - data[i+1] == '\n') - i += 2; - else - i++; - if (keepends) - eol = i; - } - _STRINGLIB_SPLIT_APPEND(data, j, eol); - j = i; - } - if (j < len) { - _STRINGLIB_SPLIT_APPEND(data, j, len); - } - - return list; - - onError: - Py_XDECREF(list); - return NULL; -} - -#undef _STRINGLIB_SPLIT_APPEND - diff --git a/Objects/stringlib/unicodedefs.h b/Objects/stringlib/unicodedefs.h index 366acfe..09dae6d 100644 --- a/Objects/stringlib/unicodedefs.h +++ b/Objects/stringlib/unicodedefs.h @@ -11,6 +11,8 @@ #define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_PARSE_CODE "U" #define STRINGLIB_EMPTY unicode_empty +#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE +#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL #define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL #define STRINGLIB_TOUPPER Py_UNICODE_TOUPPER @@ -21,6 +23,7 @@ #define STRINGLIB_NEW PyUnicode_FromUnicode #define STRINGLIB_RESIZE PyUnicode_Resize #define STRINGLIB_CHECK PyUnicode_Check +#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact #define STRINGLIB_GROUPING _PyUnicode_InsertThousandsGrouping #define STRINGLIB_GROUPING_LOCALE _PyUnicode_InsertThousandsGroupingLocale @@ -34,23 +37,4 @@ #define STRINGLIB_WANT_CONTAINS_OBJ 1 -/* STRINGLIB_CMP was defined as: - -Py_LOCAL_INLINE(int) -STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len) -{ - if (str[0] != other[0]) - return 1; - return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE)); -} - -but unfortunately that gives a error if the function isn't used in a file that -includes this file. So, reluctantly convert it to a macro instead. */ - -#define STRINGLIB_CMP(str, other, len) \ - (((str)[0] != (other)[0]) ? \ - 1 : \ - memcmp((void*) (str), (void*) (other), (len) * sizeof(Py_UNICODE))) - - #endif /* !STRINGLIB_UNICODEDEFS_H */ |