diff options
Diffstat (limited to 'Objects/stringlib')
24 files changed, 2313 insertions, 2683 deletions
diff --git a/Objects/stringlib/README.txt b/Objects/stringlib/README.txt index 8ff6ad8..ab506d6 100644 --- a/Objects/stringlib/README.txt +++ b/Objects/stringlib/README.txt @@ -1,4 +1,4 @@ -bits shared by the bytesobject and unicodeobject implementations (and +bits shared by the stringobject and unicodeobject implementations (and possibly other modules, in a not too distant future). the stuff in here is included into relevant places; see the individual diff --git a/Objects/stringlib/asciilib.h b/Objects/stringlib/asciilib.h deleted file mode 100644 index d0fc18d..0000000 --- a/Objects/stringlib/asciilib.h +++ /dev/null @@ -1,29 +0,0 @@ -/* this is sort of a hack. there's at least one place (formatting - floats) where some stringlib code takes a different path if it's - compiled as unicode. */ -#define STRINGLIB_IS_UNICODE 1 - -#define FASTSEARCH asciilib_fastsearch -#define STRINGLIB(F) asciilib_##F -#define STRINGLIB_OBJECT PyUnicodeObject -#define STRINGLIB_SIZEOF_CHAR 1 -#define STRINGLIB_MAX_CHAR 0x7Fu -#define STRINGLIB_CHAR Py_UCS1 -#define STRINGLIB_TYPE_NAME "unicode" -#define STRINGLIB_PARSE_CODE "U" -#define STRINGLIB_EMPTY unicode_empty -#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE -#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK -#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL -#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL -#define STRINGLIB_STR PyUnicode_1BYTE_DATA -#define STRINGLIB_LEN PyUnicode_GET_LENGTH -#define STRINGLIB_NEW(STR,LEN) _PyUnicode_FromASCII((char*)(STR),(LEN)) -#define STRINGLIB_CHECK PyUnicode_Check -#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact - -#define STRINGLIB_TOSTR PyObject_Str -#define STRINGLIB_TOASCII PyObject_ASCII - -#define _Py_InsertThousandsGrouping _PyUnicode_ascii_InsertThousandsGrouping - diff --git a/Objects/stringlib/clinic/transmogrify.h.h b/Objects/stringlib/clinic/transmogrify.h.h deleted file mode 100644 index 8a3a060..0000000 --- a/Objects/stringlib/clinic/transmogrify.h.h +++ /dev/null @@ -1,277 +0,0 @@ -/*[clinic input] -preserve -[clinic start generated code]*/ - -PyDoc_STRVAR(stringlib_expandtabs__doc__, -"expandtabs($self, /, tabsize=8)\n" -"--\n" -"\n" -"Return a copy where all tab characters are expanded using spaces.\n" -"\n" -"If tabsize is not given, a tab size of 8 characters is assumed."); - -#define STRINGLIB_EXPANDTABS_METHODDEF \ - {"expandtabs", (PyCFunction)(void(*)(void))stringlib_expandtabs, METH_FASTCALL|METH_KEYWORDS, stringlib_expandtabs__doc__}, - -static PyObject * -stringlib_expandtabs_impl(PyObject *self, int tabsize); - -static PyObject * -stringlib_expandtabs(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) -{ - PyObject *return_value = NULL; - static const char * const _keywords[] = {"tabsize", NULL}; - static _PyArg_Parser _parser = {NULL, _keywords, "expandtabs", 0}; - PyObject *argsbuf[1]; - Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 0; - int tabsize = 8; - - args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 0, 1, 0, argsbuf); - if (!args) { - goto exit; - } - if (!noptargs) { - goto skip_optional_pos; - } - if (PyFloat_Check(args[0])) { - PyErr_SetString(PyExc_TypeError, - "integer argument expected, got float" ); - goto exit; - } - tabsize = _PyLong_AsInt(args[0]); - if (tabsize == -1 && PyErr_Occurred()) { - goto exit; - } -skip_optional_pos: - return_value = stringlib_expandtabs_impl(self, tabsize); - -exit: - return return_value; -} - -PyDoc_STRVAR(stringlib_ljust__doc__, -"ljust($self, width, fillchar=b\' \', /)\n" -"--\n" -"\n" -"Return a left-justified string of length width.\n" -"\n" -"Padding is done using the specified fill character."); - -#define STRINGLIB_LJUST_METHODDEF \ - {"ljust", (PyCFunction)(void(*)(void))stringlib_ljust, METH_FASTCALL, stringlib_ljust__doc__}, - -static PyObject * -stringlib_ljust_impl(PyObject *self, Py_ssize_t width, char fillchar); - -static PyObject * -stringlib_ljust(PyObject *self, PyObject *const *args, Py_ssize_t nargs) -{ - PyObject *return_value = NULL; - Py_ssize_t width; - char fillchar = ' '; - - if (!_PyArg_CheckPositional("ljust", nargs, 1, 2)) { - goto exit; - } - if (PyFloat_Check(args[0])) { - PyErr_SetString(PyExc_TypeError, - "integer argument expected, got float" ); - goto exit; - } - { - Py_ssize_t ival = -1; - PyObject *iobj = PyNumber_Index(args[0]); - if (iobj != NULL) { - ival = PyLong_AsSsize_t(iobj); - Py_DECREF(iobj); - } - if (ival == -1 && PyErr_Occurred()) { - goto exit; - } - width = ival; - } - if (nargs < 2) { - goto skip_optional; - } - if (PyBytes_Check(args[1]) && PyBytes_GET_SIZE(args[1]) == 1) { - fillchar = PyBytes_AS_STRING(args[1])[0]; - } - else if (PyByteArray_Check(args[1]) && PyByteArray_GET_SIZE(args[1]) == 1) { - fillchar = PyByteArray_AS_STRING(args[1])[0]; - } - else { - _PyArg_BadArgument("ljust", "argument 2", "a byte string of length 1", args[1]); - goto exit; - } -skip_optional: - return_value = stringlib_ljust_impl(self, width, fillchar); - -exit: - return return_value; -} - -PyDoc_STRVAR(stringlib_rjust__doc__, -"rjust($self, width, fillchar=b\' \', /)\n" -"--\n" -"\n" -"Return a right-justified string of length width.\n" -"\n" -"Padding is done using the specified fill character."); - -#define STRINGLIB_RJUST_METHODDEF \ - {"rjust", (PyCFunction)(void(*)(void))stringlib_rjust, METH_FASTCALL, stringlib_rjust__doc__}, - -static PyObject * -stringlib_rjust_impl(PyObject *self, Py_ssize_t width, char fillchar); - -static PyObject * -stringlib_rjust(PyObject *self, PyObject *const *args, Py_ssize_t nargs) -{ - PyObject *return_value = NULL; - Py_ssize_t width; - char fillchar = ' '; - - if (!_PyArg_CheckPositional("rjust", nargs, 1, 2)) { - goto exit; - } - if (PyFloat_Check(args[0])) { - PyErr_SetString(PyExc_TypeError, - "integer argument expected, got float" ); - goto exit; - } - { - Py_ssize_t ival = -1; - PyObject *iobj = PyNumber_Index(args[0]); - if (iobj != NULL) { - ival = PyLong_AsSsize_t(iobj); - Py_DECREF(iobj); - } - if (ival == -1 && PyErr_Occurred()) { - goto exit; - } - width = ival; - } - if (nargs < 2) { - goto skip_optional; - } - if (PyBytes_Check(args[1]) && PyBytes_GET_SIZE(args[1]) == 1) { - fillchar = PyBytes_AS_STRING(args[1])[0]; - } - else if (PyByteArray_Check(args[1]) && PyByteArray_GET_SIZE(args[1]) == 1) { - fillchar = PyByteArray_AS_STRING(args[1])[0]; - } - else { - _PyArg_BadArgument("rjust", "argument 2", "a byte string of length 1", args[1]); - goto exit; - } -skip_optional: - return_value = stringlib_rjust_impl(self, width, fillchar); - -exit: - return return_value; -} - -PyDoc_STRVAR(stringlib_center__doc__, -"center($self, width, fillchar=b\' \', /)\n" -"--\n" -"\n" -"Return a centered string of length width.\n" -"\n" -"Padding is done using the specified fill character."); - -#define STRINGLIB_CENTER_METHODDEF \ - {"center", (PyCFunction)(void(*)(void))stringlib_center, METH_FASTCALL, stringlib_center__doc__}, - -static PyObject * -stringlib_center_impl(PyObject *self, Py_ssize_t width, char fillchar); - -static PyObject * -stringlib_center(PyObject *self, PyObject *const *args, Py_ssize_t nargs) -{ - PyObject *return_value = NULL; - Py_ssize_t width; - char fillchar = ' '; - - if (!_PyArg_CheckPositional("center", nargs, 1, 2)) { - goto exit; - } - if (PyFloat_Check(args[0])) { - PyErr_SetString(PyExc_TypeError, - "integer argument expected, got float" ); - goto exit; - } - { - Py_ssize_t ival = -1; - PyObject *iobj = PyNumber_Index(args[0]); - if (iobj != NULL) { - ival = PyLong_AsSsize_t(iobj); - Py_DECREF(iobj); - } - if (ival == -1 && PyErr_Occurred()) { - goto exit; - } - width = ival; - } - if (nargs < 2) { - goto skip_optional; - } - if (PyBytes_Check(args[1]) && PyBytes_GET_SIZE(args[1]) == 1) { - fillchar = PyBytes_AS_STRING(args[1])[0]; - } - else if (PyByteArray_Check(args[1]) && PyByteArray_GET_SIZE(args[1]) == 1) { - fillchar = PyByteArray_AS_STRING(args[1])[0]; - } - else { - _PyArg_BadArgument("center", "argument 2", "a byte string of length 1", args[1]); - goto exit; - } -skip_optional: - return_value = stringlib_center_impl(self, width, fillchar); - -exit: - return return_value; -} - -PyDoc_STRVAR(stringlib_zfill__doc__, -"zfill($self, width, /)\n" -"--\n" -"\n" -"Pad a numeric string with zeros on the left, to fill a field of the given width.\n" -"\n" -"The original string is never truncated."); - -#define STRINGLIB_ZFILL_METHODDEF \ - {"zfill", (PyCFunction)stringlib_zfill, METH_O, stringlib_zfill__doc__}, - -static PyObject * -stringlib_zfill_impl(PyObject *self, Py_ssize_t width); - -static PyObject * -stringlib_zfill(PyObject *self, PyObject *arg) -{ - PyObject *return_value = NULL; - Py_ssize_t width; - - if (PyFloat_Check(arg)) { - PyErr_SetString(PyExc_TypeError, - "integer argument expected, got float" ); - goto exit; - } - { - Py_ssize_t ival = -1; - PyObject *iobj = PyNumber_Index(arg); - if (iobj != NULL) { - ival = PyLong_AsSsize_t(iobj); - Py_DECREF(iobj); - } - if (ival == -1 && PyErr_Occurred()) { - goto exit; - } - width = ival; - } - return_value = stringlib_zfill_impl(self, width); - -exit: - return return_value; -} -/*[clinic end generated code: output=15be047aef999b4e input=a9049054013a1b77]*/ diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h deleted file mode 100644 index d6f2b98..0000000 --- a/Objects/stringlib/codecs.h +++ /dev/null @@ -1,822 +0,0 @@ -/* stringlib: codec implementations */ - -#if !STRINGLIB_IS_UNICODE -# error "codecs.h is specific to Unicode" -#endif - -/* Mask to quickly check whether a C 'long' contains a - non-ASCII, UTF8-encoded char. */ -#if (SIZEOF_LONG == 8) -# define ASCII_CHAR_MASK 0x8080808080808080UL -#elif (SIZEOF_LONG == 4) -# define ASCII_CHAR_MASK 0x80808080UL -#else -# error C 'long' size should be either 4 or 8! -#endif - -/* 10xxxxxx */ -#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0) - -Py_LOCAL_INLINE(Py_UCS4) -STRINGLIB(utf8_decode)(const char **inptr, const char *end, - STRINGLIB_CHAR *dest, - Py_ssize_t *outpos) -{ - Py_UCS4 ch; - const char *s = *inptr; - const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); - STRINGLIB_CHAR *p = dest + *outpos; - - while (s < end) { - ch = (unsigned char)*s; - - if (ch < 0x80) { - /* Fast path for runs of ASCII characters. Given that common UTF-8 - input will consist of an overwhelming majority of ASCII - characters, we try to optimize for this case by checking - as many characters as a C 'long' can contain. - First, check if we can do an aligned read, as most CPUs have - a penalty for unaligned reads. - */ - if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) { - /* Help register allocation */ - const char *_s = s; - STRINGLIB_CHAR *_p = p; - while (_s < aligned_end) { - /* Read a whole long at a time (either 4 or 8 bytes), - and do a fast unrolled copy if it only contains ASCII - characters. */ - unsigned long value = *(unsigned long *) _s; - if (value & ASCII_CHAR_MASK) - break; -#if PY_LITTLE_ENDIAN - _p[0] = (STRINGLIB_CHAR)(value & 0xFFu); - _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); - _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); - _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); -# if SIZEOF_LONG == 8 - _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); - _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); - _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); - _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); -# endif -#else -# if SIZEOF_LONG == 8 - _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu); - _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu); - _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu); - _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu); - _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); - _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); - _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); - _p[7] = (STRINGLIB_CHAR)(value & 0xFFu); -# else - _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu); - _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu); - _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu); - _p[3] = (STRINGLIB_CHAR)(value & 0xFFu); -# endif -#endif - _s += SIZEOF_LONG; - _p += SIZEOF_LONG; - } - s = _s; - p = _p; - if (s == end) - break; - ch = (unsigned char)*s; - } - if (ch < 0x80) { - s++; - *p++ = ch; - continue; - } - } - - if (ch < 0xE0) { - /* \xC2\x80-\xDF\xBF -- 0080-07FF */ - Py_UCS4 ch2; - if (ch < 0xC2) { - /* invalid sequence - \x80-\xBF -- continuation byte - \xC0-\xC1 -- fake 0000-007F */ - goto InvalidStart; - } - if (end - s < 2) { - /* unexpected end of data: the caller will decide whether - it's an error or not */ - break; - } - ch2 = (unsigned char)s[1]; - if (!IS_CONTINUATION_BYTE(ch2)) - /* invalid continuation byte */ - goto InvalidContinuation1; - ch = (ch << 6) + ch2 - - ((0xC0 << 6) + 0x80); - assert ((ch > 0x007F) && (ch <= 0x07FF)); - s += 2; - if (STRINGLIB_MAX_CHAR <= 0x007F || - (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR)) - /* Out-of-range */ - goto Return; - *p++ = ch; - continue; - } - - if (ch < 0xF0) { - /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ - Py_UCS4 ch2, ch3; - if (end - s < 3) { - /* unexpected end of data: the caller will decide whether - it's an error or not */ - if (end - s < 2) - break; - ch2 = (unsigned char)s[1]; - if (!IS_CONTINUATION_BYTE(ch2) || - (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) - /* for clarification see comments below */ - goto InvalidContinuation1; - break; - } - ch2 = (unsigned char)s[1]; - ch3 = (unsigned char)s[2]; - if (!IS_CONTINUATION_BYTE(ch2)) { - /* invalid continuation byte */ - goto InvalidContinuation1; - } - if (ch == 0xE0) { - if (ch2 < 0xA0) - /* invalid sequence - \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ - goto InvalidContinuation1; - } else if (ch == 0xED && ch2 >= 0xA0) { - /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF - will result in surrogates in range D800-DFFF. Surrogates are - not valid UTF-8 so they are rejected. - See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf - (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ - goto InvalidContinuation1; - } - if (!IS_CONTINUATION_BYTE(ch3)) { - /* invalid continuation byte */ - goto InvalidContinuation2; - } - ch = (ch << 12) + (ch2 << 6) + ch3 - - ((0xE0 << 12) + (0x80 << 6) + 0x80); - assert ((ch > 0x07FF) && (ch <= 0xFFFF)); - s += 3; - if (STRINGLIB_MAX_CHAR <= 0x07FF || - (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR)) - /* Out-of-range */ - goto Return; - *p++ = ch; - continue; - } - - if (ch < 0xF5) { - /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ - Py_UCS4 ch2, ch3, ch4; - if (end - s < 4) { - /* unexpected end of data: the caller will decide whether - it's an error or not */ - if (end - s < 2) - break; - ch2 = (unsigned char)s[1]; - if (!IS_CONTINUATION_BYTE(ch2) || - (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) - /* for clarification see comments below */ - goto InvalidContinuation1; - if (end - s < 3) - break; - ch3 = (unsigned char)s[2]; - if (!IS_CONTINUATION_BYTE(ch3)) - goto InvalidContinuation2; - break; - } - ch2 = (unsigned char)s[1]; - ch3 = (unsigned char)s[2]; - ch4 = (unsigned char)s[3]; - if (!IS_CONTINUATION_BYTE(ch2)) { - /* invalid continuation byte */ - goto InvalidContinuation1; - } - if (ch == 0xF0) { - if (ch2 < 0x90) - /* invalid sequence - \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */ - goto InvalidContinuation1; - } else if (ch == 0xF4 && ch2 >= 0x90) { - /* invalid sequence - \xF4\x90\x80\x80- -- 110000- overflow */ - goto InvalidContinuation1; - } - if (!IS_CONTINUATION_BYTE(ch3)) { - /* invalid continuation byte */ - goto InvalidContinuation2; - } - if (!IS_CONTINUATION_BYTE(ch4)) { - /* invalid continuation byte */ - goto InvalidContinuation3; - } - ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 - - ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80); - assert ((ch > 0xFFFF) && (ch <= 0x10FFFF)); - s += 4; - if (STRINGLIB_MAX_CHAR <= 0xFFFF || - (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR)) - /* Out-of-range */ - goto Return; - *p++ = ch; - continue; - } - goto InvalidStart; - } - ch = 0; -Return: - *inptr = s; - *outpos = p - dest; - return ch; -InvalidStart: - ch = 1; - goto Return; -InvalidContinuation1: - ch = 2; - goto Return; -InvalidContinuation2: - ch = 3; - goto Return; -InvalidContinuation3: - ch = 4; - goto Return; -} - -#undef ASCII_CHAR_MASK - - -/* UTF-8 encoder specialized for a Unicode kind to avoid the slow - PyUnicode_READ() macro. Delete some parts of the code depending on the kind: - UCS-1 strings don't need to handle surrogates for example. */ -Py_LOCAL_INLINE(PyObject *) -STRINGLIB(utf8_encoder)(PyObject *unicode, - STRINGLIB_CHAR *data, - Py_ssize_t size, - _Py_error_handler error_handler, - const char *errors) -{ - Py_ssize_t i; /* index into data of next input character */ - char *p; /* next free byte in output buffer */ -#if STRINGLIB_SIZEOF_CHAR > 1 - PyObject *error_handler_obj = NULL; - PyObject *exc = NULL; - PyObject *rep = NULL; -#endif -#if STRINGLIB_SIZEOF_CHAR == 1 - const Py_ssize_t max_char_size = 2; -#elif STRINGLIB_SIZEOF_CHAR == 2 - const Py_ssize_t max_char_size = 3; -#else /* STRINGLIB_SIZEOF_CHAR == 4 */ - const Py_ssize_t max_char_size = 4; -#endif - _PyBytesWriter writer; - - assert(size >= 0); - _PyBytesWriter_Init(&writer); - - if (size > PY_SSIZE_T_MAX / max_char_size) { - /* integer overflow */ - return PyErr_NoMemory(); - } - - p = _PyBytesWriter_Alloc(&writer, size * max_char_size); - if (p == NULL) - return NULL; - - for (i = 0; i < size;) { - Py_UCS4 ch = data[i++]; - - if (ch < 0x80) { - /* Encode ASCII */ - *p++ = (char) ch; - - } - else -#if STRINGLIB_SIZEOF_CHAR > 1 - if (ch < 0x0800) -#endif - { - /* Encode Latin-1 */ - *p++ = (char)(0xc0 | (ch >> 6)); - *p++ = (char)(0x80 | (ch & 0x3f)); - } -#if STRINGLIB_SIZEOF_CHAR > 1 - else if (Py_UNICODE_IS_SURROGATE(ch)) { - Py_ssize_t startpos, endpos, newpos; - Py_ssize_t k; - if (error_handler == _Py_ERROR_UNKNOWN) { - error_handler = _Py_GetErrorHandler(errors); - } - - startpos = i-1; - endpos = startpos+1; - - while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) - endpos++; - - /* Only overallocate the buffer if it's not the last write */ - writer.overallocate = (endpos < size); - - switch (error_handler) - { - case _Py_ERROR_REPLACE: - memset(p, '?', endpos - startpos); - p += (endpos - startpos); - /* fall through */ - case _Py_ERROR_IGNORE: - i += (endpos - startpos - 1); - break; - - case _Py_ERROR_SURROGATEPASS: - for (k=startpos; k<endpos; k++) { - ch = data[k]; - *p++ = (char)(0xe0 | (ch >> 12)); - *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); - *p++ = (char)(0x80 | (ch & 0x3f)); - } - i += (endpos - startpos - 1); - break; - - case _Py_ERROR_BACKSLASHREPLACE: - /* subtract preallocated bytes */ - writer.min_size -= max_char_size * (endpos - startpos); - p = backslashreplace(&writer, p, - unicode, startpos, endpos); - if (p == NULL) - goto error; - i += (endpos - startpos - 1); - break; - - case _Py_ERROR_XMLCHARREFREPLACE: - /* subtract preallocated bytes */ - writer.min_size -= max_char_size * (endpos - startpos); - p = xmlcharrefreplace(&writer, p, - unicode, startpos, endpos); - if (p == NULL) - goto error; - i += (endpos - startpos - 1); - break; - - case _Py_ERROR_SURROGATEESCAPE: - for (k=startpos; k<endpos; k++) { - ch = data[k]; - if (!(0xDC80 <= ch && ch <= 0xDCFF)) - break; - *p++ = (char)(ch & 0xff); - } - if (k >= endpos) { - i += (endpos - startpos - 1); - break; - } - startpos = k; - assert(startpos < endpos); - /* fall through */ - default: - rep = unicode_encode_call_errorhandler( - errors, &error_handler_obj, "utf-8", "surrogates not allowed", - unicode, &exc, startpos, endpos, &newpos); - if (!rep) - goto error; - - /* subtract preallocated bytes */ - writer.min_size -= max_char_size * (newpos - startpos); - - if (PyBytes_Check(rep)) { - p = _PyBytesWriter_WriteBytes(&writer, p, - PyBytes_AS_STRING(rep), - PyBytes_GET_SIZE(rep)); - } - else { - /* rep is unicode */ - if (PyUnicode_READY(rep) < 0) - goto error; - - if (!PyUnicode_IS_ASCII(rep)) { - raise_encode_exception(&exc, "utf-8", unicode, - startpos, endpos, - "surrogates not allowed"); - goto error; - } - - p = _PyBytesWriter_WriteBytes(&writer, p, - PyUnicode_DATA(rep), - PyUnicode_GET_LENGTH(rep)); - } - - if (p == NULL) - goto error; - Py_CLEAR(rep); - - i = newpos; - } - - /* If overallocation was disabled, ensure that it was the last - write. Otherwise, we missed an optimization */ - assert(writer.overallocate || i == size); - } - else -#if STRINGLIB_SIZEOF_CHAR > 2 - if (ch < 0x10000) -#endif - { - *p++ = (char)(0xe0 | (ch >> 12)); - *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); - *p++ = (char)(0x80 | (ch & 0x3f)); - } -#if STRINGLIB_SIZEOF_CHAR > 2 - else /* ch >= 0x10000 */ - { - assert(ch <= MAX_UNICODE); - /* Encode UCS4 Unicode ordinals */ - *p++ = (char)(0xf0 | (ch >> 18)); - *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); - *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); - *p++ = (char)(0x80 | (ch & 0x3f)); - } -#endif /* STRINGLIB_SIZEOF_CHAR > 2 */ -#endif /* STRINGLIB_SIZEOF_CHAR > 1 */ - } - -#if STRINGLIB_SIZEOF_CHAR > 1 - Py_XDECREF(error_handler_obj); - Py_XDECREF(exc); -#endif - return _PyBytesWriter_Finish(&writer, p); - -#if STRINGLIB_SIZEOF_CHAR > 1 - error: - Py_XDECREF(rep); - Py_XDECREF(error_handler_obj); - Py_XDECREF(exc); - _PyBytesWriter_Dealloc(&writer); - return NULL; -#endif -} - -/* The pattern for constructing UCS2-repeated masks. */ -#if SIZEOF_LONG == 8 -# define UCS2_REPEAT_MASK 0x0001000100010001ul -#elif SIZEOF_LONG == 4 -# define UCS2_REPEAT_MASK 0x00010001ul -#else -# error C 'long' size should be either 4 or 8! -#endif - -/* The mask for fast checking. */ -#if STRINGLIB_SIZEOF_CHAR == 1 -/* The mask for fast checking of whether a C 'long' contains a - non-ASCII or non-Latin1 UTF16-encoded characters. */ -# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR)) -#else -/* The mask for fast checking of whether a C 'long' may contain - UTF16-encoded surrogate characters. This is an efficient heuristic, - assuming that non-surrogate characters with a code point >= 0x8000 are - rare in most input. -*/ -# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u) -#endif -/* The mask for fast byte-swapping. */ -#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu) -/* Swap bytes. */ -#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \ - (((value) & STRIPPED_MASK) << 8)) - -Py_LOCAL_INLINE(Py_UCS4) -STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e, - STRINGLIB_CHAR *dest, Py_ssize_t *outpos, - int native_ordering) -{ - Py_UCS4 ch; - const unsigned char *aligned_end = - (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG); - const unsigned char *q = *inptr; - STRINGLIB_CHAR *p = dest + *outpos; - /* Offsets from q for retrieving byte pairs in the right order. */ -#if PY_LITTLE_ENDIAN - int ihi = !!native_ordering, ilo = !native_ordering; -#else - int ihi = !native_ordering, ilo = !!native_ordering; -#endif - --e; - - while (q < e) { - Py_UCS4 ch2; - /* First check for possible aligned read of a C 'long'. Unaligned - reads are more expensive, better to defer to another iteration. */ - if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) { - /* Fast path for runs of in-range non-surrogate chars. */ - const unsigned char *_q = q; - while (_q < aligned_end) { - unsigned long block = * (unsigned long *) _q; - if (native_ordering) { - /* Can use buffer directly */ - if (block & FAST_CHAR_MASK) - break; - } - else { - /* Need to byte-swap */ - if (block & SWAB(FAST_CHAR_MASK)) - break; -#if STRINGLIB_SIZEOF_CHAR == 1 - block >>= 8; -#else - block = SWAB(block); -#endif - } -#if PY_LITTLE_ENDIAN -# if SIZEOF_LONG == 4 - p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); - p[1] = (STRINGLIB_CHAR)(block >> 16); -# elif SIZEOF_LONG == 8 - p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); - p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); - p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); - p[3] = (STRINGLIB_CHAR)(block >> 48); -# endif -#else -# if SIZEOF_LONG == 4 - p[0] = (STRINGLIB_CHAR)(block >> 16); - p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); -# elif SIZEOF_LONG == 8 - p[0] = (STRINGLIB_CHAR)(block >> 48); - p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); - p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); - p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); -# endif -#endif - _q += SIZEOF_LONG; - p += SIZEOF_LONG / 2; - } - q = _q; - if (q >= e) - break; - } - - ch = (q[ihi] << 8) | q[ilo]; - q += 2; - if (!Py_UNICODE_IS_SURROGATE(ch)) { -#if STRINGLIB_SIZEOF_CHAR < 2 - if (ch > STRINGLIB_MAX_CHAR) - /* Out-of-range */ - goto Return; -#endif - *p++ = (STRINGLIB_CHAR)ch; - continue; - } - - /* UTF-16 code pair: */ - if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) - goto IllegalEncoding; - if (q >= e) - goto UnexpectedEnd; - ch2 = (q[ihi] << 8) | q[ilo]; - q += 2; - if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) - goto IllegalSurrogate; - ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); -#if STRINGLIB_SIZEOF_CHAR < 4 - /* Out-of-range */ - goto Return; -#else - *p++ = (STRINGLIB_CHAR)ch; -#endif - } - ch = 0; -Return: - *inptr = q; - *outpos = p - dest; - return ch; -UnexpectedEnd: - ch = 1; - goto Return; -IllegalEncoding: - ch = 2; - goto Return; -IllegalSurrogate: - ch = 3; - goto Return; -} -#undef UCS2_REPEAT_MASK -#undef FAST_CHAR_MASK -#undef STRIPPED_MASK -#undef SWAB - - -#if STRINGLIB_MAX_CHAR >= 0x80 -Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in, - Py_ssize_t len, - unsigned short **outptr, - int native_ordering) -{ - unsigned short *out = *outptr; - const STRINGLIB_CHAR *end = in + len; -#if STRINGLIB_SIZEOF_CHAR == 1 - if (native_ordering) { - const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); - while (in < unrolled_end) { - out[0] = in[0]; - out[1] = in[1]; - out[2] = in[2]; - out[3] = in[3]; - in += 4; out += 4; - } - while (in < end) { - *out++ = *in++; - } - } else { -# define SWAB2(CH) ((CH) << 8) /* high byte is zero */ - const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); - while (in < unrolled_end) { - out[0] = SWAB2(in[0]); - out[1] = SWAB2(in[1]); - out[2] = SWAB2(in[2]); - out[3] = SWAB2(in[3]); - in += 4; out += 4; - } - while (in < end) { - Py_UCS4 ch = *in++; - *out++ = SWAB2((Py_UCS2)ch); - } -#undef SWAB2 - } - *outptr = out; - return len; -#else - if (native_ordering) { -#if STRINGLIB_MAX_CHAR < 0x10000 - const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); - while (in < unrolled_end) { - /* check if any character is a surrogate character */ - if (((in[0] ^ 0xd800) & - (in[1] ^ 0xd800) & - (in[2] ^ 0xd800) & - (in[3] ^ 0xd800) & 0xf800) == 0) - break; - out[0] = in[0]; - out[1] = in[1]; - out[2] = in[2]; - out[3] = in[3]; - in += 4; out += 4; - } -#endif - while (in < end) { - Py_UCS4 ch; - ch = *in++; - if (ch < 0xd800) - *out++ = ch; - else if (ch < 0xe000) - /* reject surrogate characters (U+D800-U+DFFF) */ - goto fail; -#if STRINGLIB_MAX_CHAR >= 0x10000 - else if (ch >= 0x10000) { - out[0] = Py_UNICODE_HIGH_SURROGATE(ch); - out[1] = Py_UNICODE_LOW_SURROGATE(ch); - out += 2; - } -#endif - else - *out++ = ch; - } - } else { -#define SWAB2(CH) (((CH) << 8) | ((CH) >> 8)) -#if STRINGLIB_MAX_CHAR < 0x10000 - const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); - while (in < unrolled_end) { - /* check if any character is a surrogate character */ - if (((in[0] ^ 0xd800) & - (in[1] ^ 0xd800) & - (in[2] ^ 0xd800) & - (in[3] ^ 0xd800) & 0xf800) == 0) - break; - out[0] = SWAB2(in[0]); - out[1] = SWAB2(in[1]); - out[2] = SWAB2(in[2]); - out[3] = SWAB2(in[3]); - in += 4; out += 4; - } -#endif - while (in < end) { - Py_UCS4 ch = *in++; - if (ch < 0xd800) - *out++ = SWAB2((Py_UCS2)ch); - else if (ch < 0xe000) - /* reject surrogate characters (U+D800-U+DFFF) */ - goto fail; -#if STRINGLIB_MAX_CHAR >= 0x10000 - else if (ch >= 0x10000) { - Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch); - Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch); - out[0] = SWAB2(ch1); - out[1] = SWAB2(ch2); - out += 2; - } -#endif - else - *out++ = SWAB2((Py_UCS2)ch); - } -#undef SWAB2 - } - *outptr = out; - return len; - fail: - *outptr = out; - return len - (end - in + 1); -#endif -} - -#if STRINGLIB_SIZEOF_CHAR == 1 -# define SWAB4(CH, tmp) ((CH) << 24) /* high bytes are zero */ -#elif STRINGLIB_SIZEOF_CHAR == 2 -# define SWAB4(CH, tmp) (tmp = (CH), \ - ((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8)) - /* high bytes are zero */ -#else -# define SWAB4(CH, tmp) (tmp = (CH), \ - tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \ - ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu)) -#endif -Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in, - Py_ssize_t len, - PY_UINT32_T **outptr, - int native_ordering) -{ - PY_UINT32_T *out = *outptr; - const STRINGLIB_CHAR *end = in + len; - if (native_ordering) { - const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); - while (in < unrolled_end) { -#if STRINGLIB_SIZEOF_CHAR > 1 - /* check if any character is a surrogate character */ - if (((in[0] ^ 0xd800) & - (in[1] ^ 0xd800) & - (in[2] ^ 0xd800) & - (in[3] ^ 0xd800) & 0xf800) == 0) - break; -#endif - out[0] = in[0]; - out[1] = in[1]; - out[2] = in[2]; - out[3] = in[3]; - in += 4; out += 4; - } - while (in < end) { - Py_UCS4 ch; - ch = *in++; -#if STRINGLIB_SIZEOF_CHAR > 1 - if (Py_UNICODE_IS_SURROGATE(ch)) { - /* reject surrogate characters (U+D800-U+DFFF) */ - goto fail; - } -#endif - *out++ = ch; - } - } else { - const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4); - while (in < unrolled_end) { -#if STRINGLIB_SIZEOF_CHAR > 1 - Py_UCS4 ch1, ch2, ch3, ch4; - /* check if any character is a surrogate character */ - if (((in[0] ^ 0xd800) & - (in[1] ^ 0xd800) & - (in[2] ^ 0xd800) & - (in[3] ^ 0xd800) & 0xf800) == 0) - break; -#endif - out[0] = SWAB4(in[0], ch1); - out[1] = SWAB4(in[1], ch2); - out[2] = SWAB4(in[2], ch3); - out[3] = SWAB4(in[3], ch4); - in += 4; out += 4; - } - while (in < end) { - Py_UCS4 ch = *in++; -#if STRINGLIB_SIZEOF_CHAR > 1 - if (Py_UNICODE_IS_SURROGATE(ch)) { - /* reject surrogate characters (U+D800-U+DFFF) */ - goto fail; - } -#endif - *out++ = SWAB4(ch, ch); - } - } - *outptr = out; - return len; -#if STRINGLIB_SIZEOF_CHAR > 1 - fail: - *outptr = out; - return len - (end - in + 1); -#endif -} -#undef SWAB4 - -#endif diff --git a/Objects/stringlib/count.h b/Objects/stringlib/count.h index f48500b..de34f96 100644 --- a/Objects/stringlib/count.h +++ b/Objects/stringlib/count.h @@ -1,11 +1,14 @@ /* stringlib: count implementation */ +#ifndef STRINGLIB_COUNT_H +#define STRINGLIB_COUNT_H + #ifndef STRINGLIB_FASTSEARCH_H #error must include "stringlib/fastsearch.h" before including this module #endif Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(count)(const STRINGLIB_CHAR* str, Py_ssize_t str_len, +stringlib_count(const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, Py_ssize_t maxcount) { @@ -16,7 +19,7 @@ STRINGLIB(count)(const STRINGLIB_CHAR* str, Py_ssize_t str_len, if (sub_len == 0) return (str_len < maxcount) ? str_len + 1 : maxcount; - count = FASTSEARCH(str, str_len, sub, sub_len, maxcount, FAST_COUNT); + count = fastsearch(str, str_len, sub, sub_len, maxcount, FAST_COUNT); if (count < 0) return 0; /* no match */ @@ -24,4 +27,4 @@ STRINGLIB(count)(const STRINGLIB_CHAR* str, Py_ssize_t str_len, return count; } - +#endif diff --git a/Objects/stringlib/ctype.h b/Objects/stringlib/ctype.h index 843cfa2..739cf3d 100644 --- a/Objects/stringlib/ctype.h +++ b/Objects/stringlib/ctype.h @@ -1,53 +1,46 @@ -#if STRINGLIB_IS_UNICODE -# error "ctype.h only compatible with byte-wise strings" -#endif +/* NOTE: this API is -ONLY- for use with single byte character strings. */ +/* Do not use it with Unicode. */ #include "bytes_methods.h" static PyObject* -stringlib_isspace(PyObject *self, PyObject *Py_UNUSED(ignored)) +stringlib_isspace(PyObject *self) { return _Py_bytes_isspace(STRINGLIB_STR(self), STRINGLIB_LEN(self)); } static PyObject* -stringlib_isalpha(PyObject *self, PyObject *Py_UNUSED(ignored)) +stringlib_isalpha(PyObject *self) { return _Py_bytes_isalpha(STRINGLIB_STR(self), STRINGLIB_LEN(self)); } static PyObject* -stringlib_isalnum(PyObject *self, PyObject *Py_UNUSED(ignored)) +stringlib_isalnum(PyObject *self) { return _Py_bytes_isalnum(STRINGLIB_STR(self), STRINGLIB_LEN(self)); } static PyObject* -stringlib_isascii(PyObject *self, PyObject *Py_UNUSED(ignored)) -{ - return _Py_bytes_isascii(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -} - -static PyObject* -stringlib_isdigit(PyObject *self, PyObject *Py_UNUSED(ignored)) +stringlib_isdigit(PyObject *self) { return _Py_bytes_isdigit(STRINGLIB_STR(self), STRINGLIB_LEN(self)); } static PyObject* -stringlib_islower(PyObject *self, PyObject *Py_UNUSED(ignored)) +stringlib_islower(PyObject *self) { return _Py_bytes_islower(STRINGLIB_STR(self), STRINGLIB_LEN(self)); } static PyObject* -stringlib_isupper(PyObject *self, PyObject *Py_UNUSED(ignored)) +stringlib_isupper(PyObject *self) { return _Py_bytes_isupper(STRINGLIB_STR(self), STRINGLIB_LEN(self)); } static PyObject* -stringlib_istitle(PyObject *self, PyObject *Py_UNUSED(ignored)) +stringlib_istitle(PyObject *self) { return _Py_bytes_istitle(STRINGLIB_STR(self), STRINGLIB_LEN(self)); } @@ -56,7 +49,7 @@ stringlib_istitle(PyObject *self, PyObject *Py_UNUSED(ignored)) /* functions that return a new object partially translated by ctype funcs: */ static PyObject* -stringlib_lower(PyObject *self, PyObject *Py_UNUSED(ignored)) +stringlib_lower(PyObject *self) { PyObject* newobj; newobj = STRINGLIB_NEW(NULL, STRINGLIB_LEN(self)); @@ -68,7 +61,7 @@ stringlib_lower(PyObject *self, PyObject *Py_UNUSED(ignored)) } static PyObject* -stringlib_upper(PyObject *self, PyObject *Py_UNUSED(ignored)) +stringlib_upper(PyObject *self) { PyObject* newobj; newobj = STRINGLIB_NEW(NULL, STRINGLIB_LEN(self)); @@ -80,7 +73,7 @@ stringlib_upper(PyObject *self, PyObject *Py_UNUSED(ignored)) } static PyObject* -stringlib_title(PyObject *self, PyObject *Py_UNUSED(ignored)) +stringlib_title(PyObject *self) { PyObject* newobj; newobj = STRINGLIB_NEW(NULL, STRINGLIB_LEN(self)); @@ -92,7 +85,7 @@ stringlib_title(PyObject *self, PyObject *Py_UNUSED(ignored)) } static PyObject* -stringlib_capitalize(PyObject *self, PyObject *Py_UNUSED(ignored)) +stringlib_capitalize(PyObject *self) { PyObject* newobj; newobj = STRINGLIB_NEW(NULL, STRINGLIB_LEN(self)); @@ -104,7 +97,7 @@ stringlib_capitalize(PyObject *self, PyObject *Py_UNUSED(ignored)) } static PyObject* -stringlib_swapcase(PyObject *self, PyObject *Py_UNUSED(ignored)) +stringlib_swapcase(PyObject *self) { PyObject* newobj; newobj = STRINGLIB_NEW(NULL, STRINGLIB_LEN(self)); diff --git a/Objects/stringlib/eq.h b/Objects/stringlib/eq.h deleted file mode 100644 index ff22f91..0000000 --- a/Objects/stringlib/eq.h +++ /dev/null @@ -1,24 +0,0 @@ -/* Fast unicode equal function optimized for dictobject.c and setobject.c */ - -/* Return 1 if two unicode objects are equal, 0 if not. - * unicode_eq() is called when the hash of two unicode objects is equal. - */ -Py_LOCAL_INLINE(int) -unicode_eq(PyObject *aa, PyObject *bb) -{ - PyUnicodeObject *a = (PyUnicodeObject *)aa; - PyUnicodeObject *b = (PyUnicodeObject *)bb; - - if (PyUnicode_READY(a) == -1 || PyUnicode_READY(b) == -1) { - Py_UNREACHABLE(); - } - - if (PyUnicode_GET_LENGTH(a) != PyUnicode_GET_LENGTH(b)) - return 0; - if (PyUnicode_GET_LENGTH(a) == 0) - return 1; - if (PyUnicode_KIND(a) != PyUnicode_KIND(b)) - return 0; - return memcmp(PyUnicode_1BYTE_DATA(a), PyUnicode_1BYTE_DATA(b), - PyUnicode_GET_LENGTH(a) * PyUnicode_KIND(a)) == 0; -} diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 56a4467..e231c58 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -1,5 +1,6 @@ /* stringlib: fastsearch implementation */ +#ifndef STRINGLIB_FASTSEARCH_H #define STRINGLIB_FASTSEARCH_H /* fast search/count implementation, based on a mix between boyer- @@ -32,136 +33,8 @@ #define STRINGLIB_BLOOM(mask, ch) \ ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) -#if STRINGLIB_SIZEOF_CHAR == 1 -# define MEMCHR_CUT_OFF 15 -#else -# define MEMCHR_CUT_OFF 40 -#endif - Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(find_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) -{ - const STRINGLIB_CHAR *p, *e; - - p = s; - e = s + n; - if (n > MEMCHR_CUT_OFF) { -#if STRINGLIB_SIZEOF_CHAR == 1 - p = memchr(s, ch, n); - if (p != NULL) - return (p - s); - return -1; -#else - /* use memchr if we can choose a needle without too many likely - false positives */ - const STRINGLIB_CHAR *s1, *e1; - unsigned char needle = ch & 0xff; - /* If looking for a multiple of 256, we'd have too - many false positives looking for the '\0' byte in UCS2 - and UCS4 representations. */ - if (needle != 0) { - do { - void *candidate = memchr(p, needle, - (e - p) * sizeof(STRINGLIB_CHAR)); - if (candidate == NULL) - return -1; - s1 = p; - p = (const STRINGLIB_CHAR *) - _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); - if (*p == ch) - return (p - s); - /* False positive */ - p++; - if (p - s1 > MEMCHR_CUT_OFF) - continue; - if (e - p <= MEMCHR_CUT_OFF) - break; - e1 = p + MEMCHR_CUT_OFF; - while (p != e1) { - if (*p == ch) - return (p - s); - p++; - } - } - while (e - p > MEMCHR_CUT_OFF); - } -#endif - } - while (p < e) { - if (*p == ch) - return (p - s); - p++; - } - return -1; -} - -Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) -{ - const STRINGLIB_CHAR *p; -#ifdef HAVE_MEMRCHR - /* memrchr() is a GNU extension, available since glibc 2.1.91. - it doesn't seem as optimized as memchr(), but is still quite - faster than our hand-written loop below */ - - if (n > MEMCHR_CUT_OFF) { -#if STRINGLIB_SIZEOF_CHAR == 1 - p = memrchr(s, ch, n); - if (p != NULL) - return (p - s); - return -1; -#else - /* use memrchr if we can choose a needle without too many likely - false positives */ - const STRINGLIB_CHAR *s1; - Py_ssize_t n1; - unsigned char needle = ch & 0xff; - /* If looking for a multiple of 256, we'd have too - many false positives looking for the '\0' byte in UCS2 - and UCS4 representations. */ - if (needle != 0) { - do { - void *candidate = memrchr(s, needle, - n * sizeof(STRINGLIB_CHAR)); - if (candidate == NULL) - return -1; - n1 = n; - p = (const STRINGLIB_CHAR *) - _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); - n = p - s; - if (*p == ch) - return n; - /* False positive */ - if (n1 - n > MEMCHR_CUT_OFF) - continue; - if (n <= MEMCHR_CUT_OFF) - break; - s1 = p - MEMCHR_CUT_OFF; - while (p > s1) { - p--; - if (*p == ch) - return (p - s); - } - n = p - s; - } - while (n > MEMCHR_CUT_OFF); - } -#endif - } -#endif /* HAVE_MEMRCHR */ - p = s + n; - while (p > s) { - p--; - if (*p == ch) - return (p - s); - } - return -1; -} - -#undef MEMCHR_CUT_OFF - -Py_LOCAL_INLINE(Py_ssize_t) -FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, +fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n, const STRINGLIB_CHAR* p, Py_ssize_t m, Py_ssize_t maxcount, int mode) { @@ -179,11 +52,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, if (m <= 0) return -1; /* use special case for 1-character strings */ - if (mode == FAST_SEARCH) - return STRINGLIB(find_char)(s, n, p[0]); - else if (mode == FAST_RSEARCH) - return STRINGLIB(rfind_char)(s, n, p[0]); - else { /* FAST_COUNT */ + if (mode == FAST_COUNT) { for (i = 0; i < n; i++) if (s[i] == p[0]) { count++; @@ -191,7 +60,16 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, return maxcount; } return count; + } else if (mode == FAST_SEARCH) { + for (i = 0; i < n; i++) + if (s[i] == p[0]) + return i; + } else { /* FAST_RSEARCH */ + for (i = n - 1; i > -1; i--) + if (s[i] == p[0]) + return i; } + return -1; } mlast = m - 1; @@ -199,8 +77,6 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, mask = 0; if (mode != FAST_RSEARCH) { - const STRINGLIB_CHAR *ss = s + m - 1; - const STRINGLIB_CHAR *pp = p + m - 1; /* create compressed boyer-moore delta 1 table */ @@ -215,7 +91,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, for (i = 0; i <= w; i++) { /* note: using mlast in the skip path slows things down on x86 */ - if (ss[i] == pp[0]) { + if (s[i+m-1] == p[m-1]) { /* candidate match */ for (j = 0; j < mlast; j++) if (s[i+j] != p[j]) @@ -231,13 +107,13 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, continue; } /* miss: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) + if (!STRINGLIB_BLOOM(mask, s[i+m])) i = i + m; else i = i + skip; } else { /* skip: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) + if (!STRINGLIB_BLOOM(mask, s[i+m])) i = i + m; } } @@ -281,3 +157,4 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, return count; } +#endif diff --git a/Objects/stringlib/find.h b/Objects/stringlib/find.h index 509b929..ce615dc 100644 --- a/Objects/stringlib/find.h +++ b/Objects/stringlib/find.h @@ -1,21 +1,25 @@ /* stringlib: find/index implementation */ +#ifndef STRINGLIB_FIND_H +#define STRINGLIB_FIND_H + #ifndef STRINGLIB_FASTSEARCH_H #error must include "stringlib/fastsearch.h" before including this module #endif Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(find)(const STRINGLIB_CHAR* str, Py_ssize_t str_len, +stringlib_find(const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, Py_ssize_t offset) { Py_ssize_t pos; - assert(str_len >= 0); + if (str_len < 0) + return -1; if (sub_len == 0) return offset; - pos = FASTSEARCH(str, str_len, sub, sub_len, -1, FAST_SEARCH); + pos = fastsearch(str, str_len, sub, sub_len, -1, FAST_SEARCH); if (pos >= 0) pos += offset; @@ -24,17 +28,18 @@ STRINGLIB(find)(const STRINGLIB_CHAR* str, Py_ssize_t str_len, } Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(rfind)(const STRINGLIB_CHAR* str, Py_ssize_t str_len, +stringlib_rfind(const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, Py_ssize_t offset) { Py_ssize_t pos; - assert(str_len >= 0); + if (str_len < 0) + return -1; if (sub_len == 0) return str_len + offset; - pos = FASTSEARCH(str, str_len, sub, sub_len, -1, FAST_RSEARCH); + pos = fastsearch(str, str_len, sub, sub_len, -1, FAST_RSEARCH); if (pos >= 0) pos += offset; @@ -42,28 +47,45 @@ STRINGLIB(rfind)(const STRINGLIB_CHAR* str, Py_ssize_t str_len, return pos; } +/* helper macro to fixup start/end slice values */ +#define ADJUST_INDICES(start, end, len) \ + if (end > len) \ + end = len; \ + else if (end < 0) { \ + end += len; \ + if (end < 0) \ + end = 0; \ + } \ + if (start < 0) { \ + start += len; \ + if (start < 0) \ + start = 0; \ + } + Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(find_slice)(const STRINGLIB_CHAR* str, Py_ssize_t str_len, +stringlib_find_slice(const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, Py_ssize_t start, Py_ssize_t end) { - return STRINGLIB(find)(str + start, end - start, sub, sub_len, start); + ADJUST_INDICES(start, end, str_len); + return stringlib_find(str + start, end - start, sub, sub_len, start); } Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(rfind_slice)(const STRINGLIB_CHAR* str, Py_ssize_t str_len, +stringlib_rfind_slice(const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* sub, Py_ssize_t sub_len, Py_ssize_t start, Py_ssize_t end) { - return STRINGLIB(rfind)(str + start, end - start, sub, sub_len, start); + ADJUST_INDICES(start, end, str_len); + return stringlib_rfind(str + start, end - start, sub, sub_len, start); } #ifdef STRINGLIB_WANT_CONTAINS_OBJ Py_LOCAL_INLINE(int) -STRINGLIB(contains_obj)(PyObject* str, PyObject* sub) +stringlib_contains_obj(PyObject* str, PyObject* sub) { - return STRINGLIB(find)( + return stringlib_find( STRINGLIB_STR(str), STRINGLIB_LEN(str), STRINGLIB_STR(sub), STRINGLIB_LEN(sub), 0 ) != -1; @@ -76,14 +98,14 @@ This function is a helper for the "find" family (find, rfind, index, rindex) and for count, startswith and endswith, because they all have the same behaviour for the arguments. -It does not touch the variables received until it knows everything +It does not touch the variables received until it knows everything is ok. */ #define FORMAT_BUFFER_SIZE 50 Py_LOCAL_INLINE(int) -STRINGLIB(parse_args_finds)(const char * function_name, PyObject *args, +stringlib_parse_args_finds(const char * function_name, PyObject *args, PyObject **subobj, Py_ssize_t *start, Py_ssize_t *end) { @@ -117,3 +139,37 @@ STRINGLIB(parse_args_finds)(const char * function_name, PyObject *args, } #undef FORMAT_BUFFER_SIZE + +#if STRINGLIB_IS_UNICODE + +/* +Wraps stringlib_parse_args_finds() and additionally ensures that the +first argument is a unicode object. + +Note that we receive a pointer to the pointer of the substring object, +so when we create that object in this function we don't DECREF it, +because it continues living in the caller functions (those functions, +after finishing using the substring, must DECREF it). +*/ + +Py_LOCAL_INLINE(int) +stringlib_parse_args_finds_unicode(const char * function_name, PyObject *args, + PyUnicodeObject **substring, + Py_ssize_t *start, Py_ssize_t *end) +{ + PyObject *tmp_substring; + + if(stringlib_parse_args_finds(function_name, args, &tmp_substring, + start, end)) { + tmp_substring = PyUnicode_FromObject(tmp_substring); + if (!tmp_substring) + return 0; + *substring = (PyUnicodeObject *)tmp_substring; + return 1; + } + return 0; +} + +#endif /* STRINGLIB_IS_UNICODE */ + +#endif /* STRINGLIB_FIND_H */ diff --git a/Objects/stringlib/find_max_char.h b/Objects/stringlib/find_max_char.h deleted file mode 100644 index 8ccbc30..0000000 --- a/Objects/stringlib/find_max_char.h +++ /dev/null @@ -1,134 +0,0 @@ -/* Finding the optimal width of unicode characters in a buffer */ - -#if !STRINGLIB_IS_UNICODE -# error "find_max_char.h is specific to Unicode" -#endif - -/* Mask to quickly check whether a C 'long' contains a - non-ASCII, UTF8-encoded char. */ -#if (SIZEOF_LONG == 8) -# define UCS1_ASCII_CHAR_MASK 0x8080808080808080UL -#elif (SIZEOF_LONG == 4) -# define UCS1_ASCII_CHAR_MASK 0x80808080UL -#else -# error C 'long' size should be either 4 or 8! -#endif - -#if STRINGLIB_SIZEOF_CHAR == 1 - -Py_LOCAL_INLINE(Py_UCS4) -STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) -{ - const unsigned char *p = (const unsigned char *) begin; - const unsigned char *aligned_end = - (const unsigned char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); - - while (p < end) { - if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { - /* Help register allocation */ - const unsigned char *_p = p; - while (_p < aligned_end) { - unsigned long value = *(unsigned long *) _p; - if (value & UCS1_ASCII_CHAR_MASK) - return 255; - _p += SIZEOF_LONG; - } - p = _p; - if (p == end) - break; - } - if (*p++ & 0x80) - return 255; - } - return 127; -} - -#undef ASCII_CHAR_MASK - -#else /* STRINGLIB_SIZEOF_CHAR == 1 */ - -#define MASK_ASCII 0xFFFFFF80 -#define MASK_UCS1 0xFFFFFF00 -#define MASK_UCS2 0xFFFF0000 - -#define MAX_CHAR_ASCII 0x7f -#define MAX_CHAR_UCS1 0xff -#define MAX_CHAR_UCS2 0xffff -#define MAX_CHAR_UCS4 0x10ffff - -Py_LOCAL_INLINE(Py_UCS4) -STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) -{ -#if STRINGLIB_SIZEOF_CHAR == 2 - const Py_UCS4 mask_limit = MASK_UCS1; - const Py_UCS4 max_char_limit = MAX_CHAR_UCS2; -#elif STRINGLIB_SIZEOF_CHAR == 4 - const Py_UCS4 mask_limit = MASK_UCS2; - const Py_UCS4 max_char_limit = MAX_CHAR_UCS4; -#else -#error Invalid STRINGLIB_SIZEOF_CHAR (must be 1, 2 or 4) -#endif - Py_UCS4 mask; - Py_ssize_t n = end - begin; - const STRINGLIB_CHAR *p = begin; - const STRINGLIB_CHAR *unrolled_end = begin + _Py_SIZE_ROUND_DOWN(n, 4); - Py_UCS4 max_char; - - max_char = MAX_CHAR_ASCII; - mask = MASK_ASCII; - while (p < unrolled_end) { - STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3]; - if (bits & mask) { - if (mask == mask_limit) { - /* Limit reached */ - return max_char_limit; - } - if (mask == MASK_ASCII) { - max_char = MAX_CHAR_UCS1; - mask = MASK_UCS1; - } - else { - /* mask can't be MASK_UCS2 because of mask_limit above */ - assert(mask == MASK_UCS1); - max_char = MAX_CHAR_UCS2; - mask = MASK_UCS2; - } - /* We check the new mask on the same chars in the next iteration */ - continue; - } - p += 4; - } - while (p < end) { - if (p[0] & mask) { - if (mask == mask_limit) { - /* Limit reached */ - return max_char_limit; - } - if (mask == MASK_ASCII) { - max_char = MAX_CHAR_UCS1; - mask = MASK_UCS1; - } - else { - /* mask can't be MASK_UCS2 because of mask_limit above */ - assert(mask == MASK_UCS1); - max_char = MAX_CHAR_UCS2; - mask = MASK_UCS2; - } - /* We check the new mask on the same chars in the next iteration */ - continue; - } - p++; - } - return max_char; -} - -#undef MASK_ASCII -#undef MASK_UCS1 -#undef MASK_UCS2 -#undef MAX_CHAR_ASCII -#undef MAX_CHAR_UCS1 -#undef MAX_CHAR_UCS2 -#undef MAX_CHAR_UCS4 - -#endif /* STRINGLIB_SIZEOF_CHAR == 1 */ - diff --git a/Objects/stringlib/formatter.h b/Objects/stringlib/formatter.h new file mode 100644 index 0000000..70f574c --- /dev/null +++ b/Objects/stringlib/formatter.h @@ -0,0 +1,1547 @@ +/* implements the string, long, and float formatters. that is, + string.__format__, etc. */ + +#include <locale.h> + +/* Before including this, you must include either: + stringlib/unicodedefs.h + stringlib/stringdefs.h + + Also, you should define the names: + FORMAT_STRING + FORMAT_LONG + FORMAT_FLOAT + FORMAT_COMPLEX + to be whatever you want the public names of these functions to + be. These are the only non-static functions defined here. +*/ + +/* Raises an exception about an unknown presentation type for this + * type. */ + +static void +unknown_presentation_type(STRINGLIB_CHAR presentation_type, + const char* type_name) +{ +#if STRINGLIB_IS_UNICODE + /* If STRINGLIB_CHAR is Py_UNICODE, %c might be out-of-range, + hence the two cases. If it is char, gcc complains that the + condition below is always true, hence the ifdef. */ + if (presentation_type > 32 && presentation_type < 128) +#endif + PyErr_Format(PyExc_ValueError, + "Unknown format code '%c' " + "for object of type '%.200s'", + (char)presentation_type, + type_name); +#if STRINGLIB_IS_UNICODE + else + PyErr_Format(PyExc_ValueError, + "Unknown format code '\\x%x' " + "for object of type '%.200s'", + (unsigned int)presentation_type, + type_name); +#endif +} + +static void +invalid_comma_type(STRINGLIB_CHAR presentation_type) +{ +#if STRINGLIB_IS_UNICODE + /* See comment in unknown_presentation_type */ + if (presentation_type > 32 && presentation_type < 128) +#endif + PyErr_Format(PyExc_ValueError, + "Cannot specify ',' with '%c'.", + (char)presentation_type); +#if STRINGLIB_IS_UNICODE + else + PyErr_Format(PyExc_ValueError, + "Cannot specify ',' with '\\x%x'.", + (unsigned int)presentation_type); +#endif +} + +/* + get_integer consumes 0 or more decimal digit characters from an + input string, updates *result with the corresponding positive + integer, and returns the number of digits consumed. + + returns -1 on error. +*/ +static int +get_integer(STRINGLIB_CHAR **ptr, STRINGLIB_CHAR *end, + Py_ssize_t *result) +{ + Py_ssize_t accumulator, digitval; + int numdigits; + accumulator = numdigits = 0; + for (;;(*ptr)++, numdigits++) { + if (*ptr >= end) + break; + digitval = STRINGLIB_TODECIMAL(**ptr); + if (digitval < 0) + break; + /* + Detect possible overflow before it happens: + + accumulator * 10 + digitval > PY_SSIZE_T_MAX if and only if + accumulator > (PY_SSIZE_T_MAX - digitval) / 10. + */ + if (accumulator > (PY_SSIZE_T_MAX - digitval) / 10) { + PyErr_Format(PyExc_ValueError, + "Too many decimal digits in format string"); + return -1; + } + accumulator = accumulator * 10 + digitval; + } + *result = accumulator; + return numdigits; +} + +/************************************************************************/ +/*********** standard format specifier parsing **************************/ +/************************************************************************/ + +/* returns true if this character is a specifier alignment token */ +Py_LOCAL_INLINE(int) +is_alignment_token(STRINGLIB_CHAR c) +{ + switch (c) { + case '<': case '>': case '=': case '^': + return 1; + default: + return 0; + } +} + +/* returns true if this character is a sign element */ +Py_LOCAL_INLINE(int) +is_sign_element(STRINGLIB_CHAR c) +{ + switch (c) { + case ' ': case '+': case '-': + return 1; + default: + return 0; + } +} + + +typedef struct { + STRINGLIB_CHAR fill_char; + STRINGLIB_CHAR align; + int alternate; + STRINGLIB_CHAR sign; + Py_ssize_t width; + int thousands_separators; + Py_ssize_t precision; + STRINGLIB_CHAR type; +} InternalFormatSpec; + + +#if 0 +/* Occasionally useful for debugging. Should normally be commented out. */ +static void +DEBUG_PRINT_FORMAT_SPEC(InternalFormatSpec *format) +{ + printf("internal format spec: fill_char %d\n", format->fill_char); + printf("internal format spec: align %d\n", format->align); + printf("internal format spec: alternate %d\n", format->alternate); + printf("internal format spec: sign %d\n", format->sign); + printf("internal format spec: width %zd\n", format->width); + printf("internal format spec: thousands_separators %d\n", + format->thousands_separators); + printf("internal format spec: precision %zd\n", format->precision); + printf("internal format spec: type %c\n", format->type); + printf("\n"); +} +#endif + + +/* + ptr points to the start of the format_spec, end points just past its end. + fills in format with the parsed information. + returns 1 on success, 0 on failure. + if failure, sets the exception +*/ +static int +parse_internal_render_format_spec(STRINGLIB_CHAR *format_spec, + Py_ssize_t format_spec_len, + InternalFormatSpec *format, + char default_type, + char default_align) +{ + STRINGLIB_CHAR *ptr = format_spec; + STRINGLIB_CHAR *end = format_spec + format_spec_len; + + /* end-ptr is used throughout this code to specify the length of + the input string */ + + Py_ssize_t consumed; + int align_specified = 0; + int fill_char_specified = 0; + + format->fill_char = ' '; + format->align = default_align; + format->alternate = 0; + format->sign = '\0'; + format->width = -1; + format->thousands_separators = 0; + format->precision = -1; + format->type = default_type; + + /* If the second char is an alignment token, + then parse the fill char */ + if (end-ptr >= 2 && is_alignment_token(ptr[1])) { + format->align = ptr[1]; + format->fill_char = ptr[0]; + fill_char_specified = 1; + align_specified = 1; + ptr += 2; + } + else if (end-ptr >= 1 && is_alignment_token(ptr[0])) { + format->align = ptr[0]; + align_specified = 1; + ++ptr; + } + + /* Parse the various sign options */ + if (end-ptr >= 1 && is_sign_element(ptr[0])) { + format->sign = ptr[0]; + ++ptr; + } + + /* If the next character is #, we're in alternate mode. This only + applies to integers. */ + if (end-ptr >= 1 && ptr[0] == '#') { + format->alternate = 1; + ++ptr; + } + + /* The special case for 0-padding (backwards compat) */ + if (!fill_char_specified && end-ptr >= 1 && ptr[0] == '0') { + format->fill_char = '0'; + if (!align_specified) { + format->align = '='; + } + ++ptr; + } + + consumed = get_integer(&ptr, end, &format->width); + if (consumed == -1) + /* Overflow error. Exception already set. */ + return 0; + + /* If consumed is 0, we didn't consume any characters for the + width. In that case, reset the width to -1, because + get_integer() will have set it to zero. -1 is how we record + that the width wasn't specified. */ + if (consumed == 0) + format->width = -1; + + /* Comma signifies add thousands separators */ + if (end-ptr && ptr[0] == ',') { + format->thousands_separators = 1; + ++ptr; + } + + /* Parse field precision */ + if (end-ptr && ptr[0] == '.') { + ++ptr; + + consumed = get_integer(&ptr, end, &format->precision); + if (consumed == -1) + /* Overflow error. Exception already set. */ + return 0; + + /* Not having a precision after a dot is an error. */ + if (consumed == 0) { + PyErr_Format(PyExc_ValueError, + "Format specifier missing precision"); + return 0; + } + + } + + /* Finally, parse the type field. */ + + if (end-ptr > 1) { + /* More than one char remain, invalid conversion spec. */ + PyErr_Format(PyExc_ValueError, "Invalid conversion specification"); + return 0; + } + + if (end-ptr == 1) { + format->type = ptr[0]; + ++ptr; + } + + /* Do as much validating as we can, just by looking at the format + specifier. Do not take into account what type of formatting + we're doing (int, float, string). */ + + if (format->thousands_separators) { + switch (format->type) { + case 'd': + case 'e': + case 'f': + case 'g': + case 'E': + case 'G': + case '%': + case 'F': + case '\0': + /* These are allowed. See PEP 378.*/ + break; + default: + invalid_comma_type(format->type); + return 0; + } + } + + return 1; +} + +/* Calculate the padding needed. */ +static void +calc_padding(Py_ssize_t nchars, Py_ssize_t width, STRINGLIB_CHAR align, + Py_ssize_t *n_lpadding, Py_ssize_t *n_rpadding, + Py_ssize_t *n_total) +{ + if (width >= 0) { + if (nchars > width) + *n_total = nchars; + else + *n_total = width; + } + else { + /* not specified, use all of the chars and no more */ + *n_total = nchars; + } + + /* Figure out how much leading space we need, based on the + aligning */ + if (align == '>') + *n_lpadding = *n_total - nchars; + else if (align == '^') + *n_lpadding = (*n_total - nchars) / 2; + else if (align == '<' || align == '=') + *n_lpadding = 0; + else { + /* We should never have an unspecified alignment. */ + *n_lpadding = 0; + assert(0); + } + + *n_rpadding = *n_total - nchars - *n_lpadding; +} + +/* Do the padding, and return a pointer to where the caller-supplied + content goes. */ +static STRINGLIB_CHAR * +fill_padding(STRINGLIB_CHAR *p, Py_ssize_t nchars, STRINGLIB_CHAR fill_char, + Py_ssize_t n_lpadding, Py_ssize_t n_rpadding) +{ + /* Pad on left. */ + if (n_lpadding) + STRINGLIB_FILL(p, fill_char, n_lpadding); + + /* Pad on right. */ + if (n_rpadding) + STRINGLIB_FILL(p + nchars + n_lpadding, fill_char, n_rpadding); + + /* Pointer to the user content. */ + return p + n_lpadding; +} + +#if defined FORMAT_FLOAT || defined FORMAT_LONG || defined FORMAT_COMPLEX +/************************************************************************/ +/*********** common routines for numeric formatting *********************/ +/************************************************************************/ + +/* Locale type codes. */ +#define LT_CURRENT_LOCALE 0 +#define LT_DEFAULT_LOCALE 1 +#define LT_NO_LOCALE 2 + +/* Locale info needed for formatting integers and the part of floats + before and including the decimal. Note that locales only support + 8-bit chars, not unicode. */ +typedef struct { + char *decimal_point; + char *thousands_sep; + char *grouping; +} LocaleInfo; + +/* describes the layout for an integer, see the comment in + calc_number_widths() for details */ +typedef struct { + Py_ssize_t n_lpadding; + Py_ssize_t n_prefix; + Py_ssize_t n_spadding; + Py_ssize_t n_rpadding; + char sign; + Py_ssize_t n_sign; /* number of digits needed for sign (0/1) */ + Py_ssize_t n_grouped_digits; /* Space taken up by the digits, including + any grouping chars. */ + Py_ssize_t n_decimal; /* 0 if only an integer */ + Py_ssize_t n_remainder; /* Digits in decimal and/or exponent part, + excluding the decimal itself, if + present. */ + + /* These 2 are not the widths of fields, but are needed by + STRINGLIB_GROUPING. */ + Py_ssize_t n_digits; /* The number of digits before a decimal + or exponent. */ + Py_ssize_t n_min_width; /* The min_width we used when we computed + the n_grouped_digits width. */ +} NumberFieldWidths; + + +/* Given a number of the form: + digits[remainder] + where ptr points to the start and end points to the end, find where + the integer part ends. This could be a decimal, an exponent, both, + or neither. + If a decimal point is present, set *has_decimal and increment + remainder beyond it. + Results are undefined (but shouldn't crash) for improperly + formatted strings. +*/ +static void +parse_number(STRINGLIB_CHAR *ptr, Py_ssize_t len, + Py_ssize_t *n_remainder, int *has_decimal) +{ + STRINGLIB_CHAR *end = ptr + len; + STRINGLIB_CHAR *remainder; + + while (ptr<end && isdigit(*ptr)) + ++ptr; + remainder = ptr; + + /* Does remainder start with a decimal point? */ + *has_decimal = ptr<end && *remainder == '.'; + + /* Skip the decimal point. */ + if (*has_decimal) + remainder++; + + *n_remainder = end - remainder; +} + +/* not all fields of format are used. for example, precision is + unused. should this take discrete params in order to be more clear + about what it does? or is passing a single format parameter easier + and more efficient enough to justify a little obfuscation? */ +static Py_ssize_t +calc_number_widths(NumberFieldWidths *spec, Py_ssize_t n_prefix, + STRINGLIB_CHAR sign_char, STRINGLIB_CHAR *number, + Py_ssize_t n_number, Py_ssize_t n_remainder, + int has_decimal, const LocaleInfo *locale, + const InternalFormatSpec *format) +{ + Py_ssize_t n_non_digit_non_padding; + Py_ssize_t n_padding; + + spec->n_digits = n_number - n_remainder - (has_decimal?1:0); + spec->n_lpadding = 0; + spec->n_prefix = n_prefix; + spec->n_decimal = has_decimal ? strlen(locale->decimal_point) : 0; + spec->n_remainder = n_remainder; + spec->n_spadding = 0; + spec->n_rpadding = 0; + spec->sign = '\0'; + spec->n_sign = 0; + + /* the output will look like: + | | + | <lpadding> <sign> <prefix> <spadding> <grouped_digits> <decimal> <remainder> <rpadding> | + | | + + sign is computed from format->sign and the actual + sign of the number + + prefix is given (it's for the '0x' prefix) + + digits is already known + + the total width is either given, or computed from the + actual digits + + only one of lpadding, spadding, and rpadding can be non-zero, + and it's calculated from the width and other fields + */ + + /* compute the various parts we're going to write */ + switch (format->sign) { + case '+': + /* always put a + or - */ + spec->n_sign = 1; + spec->sign = (sign_char == '-' ? '-' : '+'); + break; + case ' ': + spec->n_sign = 1; + spec->sign = (sign_char == '-' ? '-' : ' '); + break; + default: + /* Not specified, or the default (-) */ + if (sign_char == '-') { + spec->n_sign = 1; + spec->sign = '-'; + } + } + + /* The number of chars used for non-digits and non-padding. */ + n_non_digit_non_padding = spec->n_sign + spec->n_prefix + spec->n_decimal + + spec->n_remainder; + + /* min_width can go negative, that's okay. format->width == -1 means + we don't care. */ + if (format->fill_char == '0' && format->align == '=') + spec->n_min_width = format->width - n_non_digit_non_padding; + else + spec->n_min_width = 0; + + if (spec->n_digits == 0) + /* This case only occurs when using 'c' formatting, we need + to special case it because the grouping code always wants + to have at least one character. */ + spec->n_grouped_digits = 0; + else + spec->n_grouped_digits = STRINGLIB_GROUPING(NULL, 0, NULL, + spec->n_digits, + spec->n_min_width, + locale->grouping, + locale->thousands_sep); + + /* Given the desired width and the total of digit and non-digit + space we consume, see if we need any padding. format->width can + be negative (meaning no padding), but this code still works in + that case. */ + n_padding = format->width - + (n_non_digit_non_padding + spec->n_grouped_digits); + if (n_padding > 0) { + /* Some padding is needed. Determine if it's left, space, or right. */ + switch (format->align) { + case '<': + spec->n_rpadding = n_padding; + break; + case '^': + spec->n_lpadding = n_padding / 2; + spec->n_rpadding = n_padding - spec->n_lpadding; + break; + case '=': + spec->n_spadding = n_padding; + break; + case '>': + spec->n_lpadding = n_padding; + break; + default: + /* Shouldn't get here, but treat it as '>' */ + spec->n_lpadding = n_padding; + assert(0); + break; + } + } + return spec->n_lpadding + spec->n_sign + spec->n_prefix + + spec->n_spadding + spec->n_grouped_digits + spec->n_decimal + + spec->n_remainder + spec->n_rpadding; +} + +/* Fill in the digit parts of a numbers's string representation, + as determined in calc_number_widths(). + No error checking, since we know the buffer is the correct size. */ +static void +fill_number(STRINGLIB_CHAR *buf, const NumberFieldWidths *spec, + STRINGLIB_CHAR *digits, Py_ssize_t n_digits, + STRINGLIB_CHAR *prefix, STRINGLIB_CHAR fill_char, + LocaleInfo *locale, int toupper) +{ + /* Used to keep track of digits, decimal, and remainder. */ + STRINGLIB_CHAR *p = digits; + +#ifndef NDEBUG + Py_ssize_t r; +#endif + + if (spec->n_lpadding) { + STRINGLIB_FILL(buf, fill_char, spec->n_lpadding); + buf += spec->n_lpadding; + } + if (spec->n_sign == 1) { + *buf++ = spec->sign; + } + if (spec->n_prefix) { + memmove(buf, + prefix, + spec->n_prefix * sizeof(STRINGLIB_CHAR)); + if (toupper) { + Py_ssize_t t; + for (t = 0; t < spec->n_prefix; ++t) + buf[t] = STRINGLIB_TOUPPER(buf[t]); + } + buf += spec->n_prefix; + } + if (spec->n_spadding) { + STRINGLIB_FILL(buf, fill_char, spec->n_spadding); + buf += spec->n_spadding; + } + + /* Only for type 'c' special case, it has no digits. */ + if (spec->n_digits != 0) { + /* Fill the digits with InsertThousandsGrouping. */ +#ifndef NDEBUG + r = +#endif + STRINGLIB_GROUPING(buf, spec->n_grouped_digits, digits, + spec->n_digits, spec->n_min_width, + locale->grouping, locale->thousands_sep); +#ifndef NDEBUG + assert(r == spec->n_grouped_digits); +#endif + p += spec->n_digits; + } + if (toupper) { + Py_ssize_t t; + for (t = 0; t < spec->n_grouped_digits; ++t) + buf[t] = STRINGLIB_TOUPPER(buf[t]); + } + buf += spec->n_grouped_digits; + + if (spec->n_decimal) { + Py_ssize_t t; + for (t = 0; t < spec->n_decimal; ++t) + buf[t] = locale->decimal_point[t]; + buf += spec->n_decimal; + p += 1; + } + + if (spec->n_remainder) { + memcpy(buf, p, spec->n_remainder * sizeof(STRINGLIB_CHAR)); + buf += spec->n_remainder; + p += spec->n_remainder; + } + + if (spec->n_rpadding) { + STRINGLIB_FILL(buf, fill_char, spec->n_rpadding); + buf += spec->n_rpadding; + } +} + +static char no_grouping[1] = {CHAR_MAX}; + +/* Find the decimal point character(s?), thousands_separator(s?), and + grouping description, either for the current locale if type is + LT_CURRENT_LOCALE, a hard-coded locale if LT_DEFAULT_LOCALE, or + none if LT_NO_LOCALE. */ +static void +get_locale_info(int type, LocaleInfo *locale_info) +{ + switch (type) { + case LT_CURRENT_LOCALE: { + struct lconv *locale_data = localeconv(); + locale_info->decimal_point = locale_data->decimal_point; + locale_info->thousands_sep = locale_data->thousands_sep; + locale_info->grouping = locale_data->grouping; + break; + } + case LT_DEFAULT_LOCALE: + locale_info->decimal_point = "."; + locale_info->thousands_sep = ","; + locale_info->grouping = "\3"; /* Group every 3 characters. The + (implicit) trailing 0 means repeat + infinitely. */ + break; + case LT_NO_LOCALE: + locale_info->decimal_point = "."; + locale_info->thousands_sep = ""; + locale_info->grouping = no_grouping; + break; + default: + assert(0); + } +} + +#endif /* FORMAT_FLOAT || FORMAT_LONG || FORMAT_COMPLEX */ + +/************************************************************************/ +/*********** string formatting ******************************************/ +/************************************************************************/ + +static PyObject * +format_string_internal(PyObject *value, const InternalFormatSpec *format) +{ + Py_ssize_t lpad; + Py_ssize_t rpad; + Py_ssize_t total; + STRINGLIB_CHAR *p; + Py_ssize_t len = STRINGLIB_LEN(value); + PyObject *result = NULL; + + /* sign is not allowed on strings */ + if (format->sign != '\0') { + PyErr_SetString(PyExc_ValueError, + "Sign not allowed in string format specifier"); + goto done; + } + + /* alternate is not allowed on strings */ + if (format->alternate) { + PyErr_SetString(PyExc_ValueError, + "Alternate form (#) not allowed in string format " + "specifier"); + goto done; + } + + /* '=' alignment not allowed on strings */ + if (format->align == '=') { + PyErr_SetString(PyExc_ValueError, + "'=' alignment not allowed " + "in string format specifier"); + goto done; + } + + /* if precision is specified, output no more that format.precision + characters */ + if (format->precision >= 0 && len >= format->precision) { + len = format->precision; + } + + calc_padding(len, format->width, format->align, &lpad, &rpad, &total); + + /* allocate the resulting string */ + result = STRINGLIB_NEW(NULL, total); + if (result == NULL) + goto done; + + /* Write into that space. First the padding. */ + p = fill_padding(STRINGLIB_STR(result), len, + format->fill_char, lpad, rpad); + + /* Then the source string. */ + memcpy(p, STRINGLIB_STR(value), len * sizeof(STRINGLIB_CHAR)); + +done: + return result; +} + + +/************************************************************************/ +/*********** long formatting ********************************************/ +/************************************************************************/ + +#if defined FORMAT_LONG || defined FORMAT_INT +typedef PyObject* +(*IntOrLongToString)(PyObject *value, int base); + +static PyObject * +format_int_or_long_internal(PyObject *value, const InternalFormatSpec *format, + IntOrLongToString tostring) +{ + PyObject *result = NULL; + PyObject *tmp = NULL; + STRINGLIB_CHAR *pnumeric_chars; + STRINGLIB_CHAR numeric_char; + STRINGLIB_CHAR sign_char = '\0'; + Py_ssize_t n_digits; /* count of digits need from the computed + string */ + Py_ssize_t n_remainder = 0; /* Used only for 'c' formatting, which + produces non-digits */ + Py_ssize_t n_prefix = 0; /* Count of prefix chars, (e.g., '0x') */ + Py_ssize_t n_total; + STRINGLIB_CHAR *prefix = NULL; + NumberFieldWidths spec; + long x; + + /* Locale settings, either from the actual locale or + from a hard-code pseudo-locale */ + LocaleInfo locale; + + /* no precision allowed on integers */ + if (format->precision != -1) { + PyErr_SetString(PyExc_ValueError, + "Precision not allowed in integer format specifier"); + goto done; + } + + /* special case for character formatting */ + if (format->type == 'c') { + /* error to specify a sign */ + if (format->sign != '\0') { + PyErr_SetString(PyExc_ValueError, + "Sign not allowed with integer" + " format specifier 'c'"); + goto done; + } + + /* Error to specify a comma. */ + if (format->thousands_separators) { + PyErr_SetString(PyExc_ValueError, + "Thousands separators not allowed with integer" + " format specifier 'c'"); + goto done; + } + + /* taken from unicodeobject.c formatchar() */ + /* Integer input truncated to a character */ +/* XXX: won't work for int */ + x = PyLong_AsLong(value); + if (x == -1 && PyErr_Occurred()) + goto done; +#if STRINGLIB_IS_UNICODE +#ifdef Py_UNICODE_WIDE + if (x < 0 || x > 0x10ffff) { + PyErr_SetString(PyExc_OverflowError, + "%c arg not in range(0x110000) " + "(wide Python build)"); + goto done; + } +#else + if (x < 0 || x > 0xffff) { + PyErr_SetString(PyExc_OverflowError, + "%c arg not in range(0x10000) " + "(narrow Python build)"); + goto done; + } +#endif +#else + if (x < 0 || x > 0xff) { + PyErr_SetString(PyExc_OverflowError, + "%c arg not in range(0x100)"); + goto done; + } +#endif + numeric_char = (STRINGLIB_CHAR)x; + pnumeric_chars = &numeric_char; + n_digits = 1; + + /* As a sort-of hack, we tell calc_number_widths that we only + have "remainder" characters. calc_number_widths thinks + these are characters that don't get formatted, only copied + into the output string. We do this for 'c' formatting, + because the characters are likely to be non-digits. */ + n_remainder = 1; + } + else { + int base; + int leading_chars_to_skip = 0; /* Number of characters added by + PyNumber_ToBase that we want to + skip over. */ + + /* Compute the base and how many characters will be added by + PyNumber_ToBase */ + switch (format->type) { + case 'b': + base = 2; + leading_chars_to_skip = 2; /* 0b */ + break; + case 'o': + base = 8; + leading_chars_to_skip = 2; /* 0o */ + break; + case 'x': + case 'X': + base = 16; + leading_chars_to_skip = 2; /* 0x */ + break; + default: /* shouldn't be needed, but stops a compiler warning */ + case 'd': + case 'n': + base = 10; + break; + } + + /* The number of prefix chars is the same as the leading + chars to skip */ + if (format->alternate) + n_prefix = leading_chars_to_skip; + + /* Do the hard part, converting to a string in a given base */ + tmp = tostring(value, base); + if (tmp == NULL) + goto done; + + pnumeric_chars = STRINGLIB_STR(tmp); + n_digits = STRINGLIB_LEN(tmp); + + prefix = pnumeric_chars; + + /* Remember not to modify what pnumeric_chars points to. it + might be interned. Only modify it after we copy it into a + newly allocated output buffer. */ + + /* Is a sign character present in the output? If so, remember it + and skip it */ + if (pnumeric_chars[0] == '-') { + sign_char = pnumeric_chars[0]; + ++prefix; + ++leading_chars_to_skip; + } + + /* Skip over the leading chars (0x, 0b, etc.) */ + n_digits -= leading_chars_to_skip; + pnumeric_chars += leading_chars_to_skip; + } + + /* Determine the grouping, separator, and decimal point, if any. */ + get_locale_info(format->type == 'n' ? LT_CURRENT_LOCALE : + (format->thousands_separators ? + LT_DEFAULT_LOCALE : + LT_NO_LOCALE), + &locale); + + /* Calculate how much memory we'll need. */ + n_total = calc_number_widths(&spec, n_prefix, sign_char, pnumeric_chars, + n_digits, n_remainder, 0, &locale, format); + + /* Allocate the memory. */ + result = STRINGLIB_NEW(NULL, n_total); + if (!result) + goto done; + + /* Populate the memory. */ + fill_number(STRINGLIB_STR(result), &spec, pnumeric_chars, n_digits, + prefix, format->fill_char, &locale, format->type == 'X'); + +done: + Py_XDECREF(tmp); + return result; +} +#endif /* defined FORMAT_LONG || defined FORMAT_INT */ + +/************************************************************************/ +/*********** float formatting *******************************************/ +/************************************************************************/ + +#ifdef FORMAT_FLOAT +#if STRINGLIB_IS_UNICODE +static void +strtounicode(Py_UNICODE *buffer, const char *charbuffer, Py_ssize_t len) +{ + Py_ssize_t i; + for (i = 0; i < len; ++i) + buffer[i] = (Py_UNICODE)charbuffer[i]; +} +#endif + +/* much of this is taken from unicodeobject.c */ +static PyObject * +format_float_internal(PyObject *value, + const InternalFormatSpec *format) +{ + char *buf = NULL; /* buffer returned from PyOS_double_to_string */ + Py_ssize_t n_digits; + Py_ssize_t n_remainder; + Py_ssize_t n_total; + int has_decimal; + double val; + Py_ssize_t precision; + Py_ssize_t default_precision = 6; + STRINGLIB_CHAR type = format->type; + int add_pct = 0; + STRINGLIB_CHAR *p; + NumberFieldWidths spec; + int flags = 0; + PyObject *result = NULL; + STRINGLIB_CHAR sign_char = '\0'; + int float_type; /* Used to see if we have a nan, inf, or regular float. */ + +#if STRINGLIB_IS_UNICODE + Py_UNICODE *unicode_tmp = NULL; +#endif + + /* Locale settings, either from the actual locale or + from a hard-code pseudo-locale */ + LocaleInfo locale; + + if (format->precision > INT_MAX) { + PyErr_SetString(PyExc_ValueError, "precision too big"); + goto done; + } + precision = (int)format->precision; + + /* Alternate is not allowed on floats. */ + if (format->alternate) { + PyErr_SetString(PyExc_ValueError, + "Alternate form (#) not allowed in float format " + "specifier"); + goto done; + } + + if (type == '\0') { + /* Omitted type specifier. This is like 'g' but with at least one + digit after the decimal point, and different default precision.*/ + type = 'g'; + default_precision = PyFloat_STR_PRECISION; + flags |= Py_DTSF_ADD_DOT_0; + } + + if (type == 'n') + /* 'n' is the same as 'g', except for the locale used to + format the result. We take care of that later. */ + type = 'g'; + + val = PyFloat_AsDouble(value); + if (val == -1.0 && PyErr_Occurred()) + goto done; + + if (type == '%') { + type = 'f'; + val *= 100; + add_pct = 1; + } + + if (precision < 0) + precision = default_precision; + + /* Cast "type", because if we're in unicode we need to pass an + 8-bit char. This is safe, because we've restricted what "type" + can be. */ + buf = PyOS_double_to_string(val, (char)type, precision, flags, + &float_type); + if (buf == NULL) + goto done; + n_digits = strlen(buf); + + if (add_pct) { + /* We know that buf has a trailing zero (since we just called + strlen() on it), and we don't use that fact any more. So we + can just write over the trailing zero. */ + buf[n_digits] = '%'; + n_digits += 1; + } + + /* Since there is no unicode version of PyOS_double_to_string, + just use the 8 bit version and then convert to unicode. */ +#if STRINGLIB_IS_UNICODE + unicode_tmp = (Py_UNICODE*)PyMem_Malloc((n_digits)*sizeof(Py_UNICODE)); + if (unicode_tmp == NULL) { + PyErr_NoMemory(); + goto done; + } + strtounicode(unicode_tmp, buf, n_digits); + p = unicode_tmp; +#else + p = buf; +#endif + + /* Is a sign character present in the output? If so, remember it + and skip it */ + if (*p == '-') { + sign_char = *p; + ++p; + --n_digits; + } + + /* Determine if we have any "remainder" (after the digits, might include + decimal or exponent or both (or neither)) */ + parse_number(p, n_digits, &n_remainder, &has_decimal); + + /* Determine the grouping, separator, and decimal point, if any. */ + get_locale_info(format->type == 'n' ? LT_CURRENT_LOCALE : + (format->thousands_separators ? + LT_DEFAULT_LOCALE : + LT_NO_LOCALE), + &locale); + + /* Calculate how much memory we'll need. */ + n_total = calc_number_widths(&spec, 0, sign_char, p, n_digits, + n_remainder, has_decimal, &locale, format); + + /* Allocate the memory. */ + result = STRINGLIB_NEW(NULL, n_total); + if (result == NULL) + goto done; + + /* Populate the memory. */ + fill_number(STRINGLIB_STR(result), &spec, p, n_digits, NULL, + format->fill_char, &locale, 0); + +done: + PyMem_Free(buf); +#if STRINGLIB_IS_UNICODE + PyMem_Free(unicode_tmp); +#endif + return result; +} +#endif /* FORMAT_FLOAT */ + +/************************************************************************/ +/*********** complex formatting *****************************************/ +/************************************************************************/ + +#ifdef FORMAT_COMPLEX + +static PyObject * +format_complex_internal(PyObject *value, + const InternalFormatSpec *format) +{ + double re; + double im; + char *re_buf = NULL; /* buffer returned from PyOS_double_to_string */ + char *im_buf = NULL; /* buffer returned from PyOS_double_to_string */ + + InternalFormatSpec tmp_format = *format; + Py_ssize_t n_re_digits; + Py_ssize_t n_im_digits; + Py_ssize_t n_re_remainder; + Py_ssize_t n_im_remainder; + Py_ssize_t n_re_total; + Py_ssize_t n_im_total; + int re_has_decimal; + int im_has_decimal; + Py_ssize_t precision; + Py_ssize_t default_precision = 6; + STRINGLIB_CHAR type = format->type; + STRINGLIB_CHAR *p_re; + STRINGLIB_CHAR *p_im; + NumberFieldWidths re_spec; + NumberFieldWidths im_spec; + int flags = 0; + PyObject *result = NULL; + STRINGLIB_CHAR *p; + STRINGLIB_CHAR re_sign_char = '\0'; + STRINGLIB_CHAR im_sign_char = '\0'; + int re_float_type; /* Used to see if we have a nan, inf, or regular float. */ + int im_float_type; + int add_parens = 0; + int skip_re = 0; + Py_ssize_t lpad; + Py_ssize_t rpad; + Py_ssize_t total; + +#if STRINGLIB_IS_UNICODE + Py_UNICODE *re_unicode_tmp = NULL; + Py_UNICODE *im_unicode_tmp = NULL; +#endif + + /* Locale settings, either from the actual locale or + from a hard-code pseudo-locale */ + LocaleInfo locale; + + if (format->precision > INT_MAX) { + PyErr_SetString(PyExc_ValueError, "precision too big"); + goto done; + } + precision = (int)format->precision; + + /* Alternate is not allowed on complex. */ + if (format->alternate) { + PyErr_SetString(PyExc_ValueError, + "Alternate form (#) not allowed in complex format " + "specifier"); + goto done; + } + + /* Neither is zero pading. */ + if (format->fill_char == '0') { + PyErr_SetString(PyExc_ValueError, + "Zero padding is not allowed in complex format " + "specifier"); + goto done; + } + + /* Neither is '=' alignment . */ + if (format->align == '=') { + PyErr_SetString(PyExc_ValueError, + "'=' alignment flag is not allowed in complex format " + "specifier"); + goto done; + } + + re = PyComplex_RealAsDouble(value); + if (re == -1.0 && PyErr_Occurred()) + goto done; + im = PyComplex_ImagAsDouble(value); + if (im == -1.0 && PyErr_Occurred()) + goto done; + + if (type == '\0') { + /* Omitted type specifier. Should be like str(self). */ + type = 'g'; + default_precision = PyFloat_STR_PRECISION; + if (re == 0.0 && copysign(1.0, re) == 1.0) + skip_re = 1; + else + add_parens = 1; + } + + if (type == 'n') + /* 'n' is the same as 'g', except for the locale used to + format the result. We take care of that later. */ + type = 'g'; + + if (precision < 0) + precision = default_precision; + + /* Cast "type", because if we're in unicode we need to pass an + 8-bit char. This is safe, because we've restricted what "type" + can be. */ + re_buf = PyOS_double_to_string(re, (char)type, precision, flags, + &re_float_type); + if (re_buf == NULL) + goto done; + im_buf = PyOS_double_to_string(im, (char)type, precision, flags, + &im_float_type); + if (im_buf == NULL) + goto done; + + n_re_digits = strlen(re_buf); + n_im_digits = strlen(im_buf); + + /* Since there is no unicode version of PyOS_double_to_string, + just use the 8 bit version and then convert to unicode. */ +#if STRINGLIB_IS_UNICODE + re_unicode_tmp = (Py_UNICODE*)PyMem_Malloc((n_re_digits)*sizeof(Py_UNICODE)); + if (re_unicode_tmp == NULL) { + PyErr_NoMemory(); + goto done; + } + strtounicode(re_unicode_tmp, re_buf, n_re_digits); + p_re = re_unicode_tmp; + + im_unicode_tmp = (Py_UNICODE*)PyMem_Malloc((n_im_digits)*sizeof(Py_UNICODE)); + if (im_unicode_tmp == NULL) { + PyErr_NoMemory(); + goto done; + } + strtounicode(im_unicode_tmp, im_buf, n_im_digits); + p_im = im_unicode_tmp; +#else + p_re = re_buf; + p_im = im_buf; +#endif + + /* Is a sign character present in the output? If so, remember it + and skip it */ + if (*p_re == '-') { + re_sign_char = *p_re; + ++p_re; + --n_re_digits; + } + if (*p_im == '-') { + im_sign_char = *p_im; + ++p_im; + --n_im_digits; + } + + /* Determine if we have any "remainder" (after the digits, might include + decimal or exponent or both (or neither)) */ + parse_number(p_re, n_re_digits, &n_re_remainder, &re_has_decimal); + parse_number(p_im, n_im_digits, &n_im_remainder, &im_has_decimal); + + /* Determine the grouping, separator, and decimal point, if any. */ + get_locale_info(format->type == 'n' ? LT_CURRENT_LOCALE : + (format->thousands_separators ? + LT_DEFAULT_LOCALE : + LT_NO_LOCALE), + &locale); + + /* Turn off any padding. We'll do it later after we've composed + the numbers without padding. */ + tmp_format.fill_char = '\0'; + tmp_format.align = '<'; + tmp_format.width = -1; + + /* Calculate how much memory we'll need. */ + n_re_total = calc_number_widths(&re_spec, 0, re_sign_char, p_re, + n_re_digits, n_re_remainder, + re_has_decimal, &locale, &tmp_format); + + /* Same formatting, but always include a sign, unless the real part is + * going to be omitted, in which case we use whatever sign convention was + * requested by the original format. */ + if (!skip_re) + tmp_format.sign = '+'; + n_im_total = calc_number_widths(&im_spec, 0, im_sign_char, p_im, + n_im_digits, n_im_remainder, + im_has_decimal, &locale, &tmp_format); + + if (skip_re) + n_re_total = 0; + + /* Add 1 for the 'j', and optionally 2 for parens. */ + calc_padding(n_re_total + n_im_total + 1 + add_parens * 2, + format->width, format->align, &lpad, &rpad, &total); + + result = STRINGLIB_NEW(NULL, total); + if (result == NULL) + goto done; + + /* Populate the memory. First, the padding. */ + p = fill_padding(STRINGLIB_STR(result), + n_re_total + n_im_total + 1 + add_parens * 2, + format->fill_char, lpad, rpad); + + if (add_parens) + *p++ = '('; + + if (!skip_re) { + fill_number(p, &re_spec, p_re, n_re_digits, NULL, 0, &locale, 0); + p += n_re_total; + } + fill_number(p, &im_spec, p_im, n_im_digits, NULL, 0, &locale, 0); + p += n_im_total; + *p++ = 'j'; + + if (add_parens) + *p++ = ')'; + +done: + PyMem_Free(re_buf); + PyMem_Free(im_buf); +#if STRINGLIB_IS_UNICODE + PyMem_Free(re_unicode_tmp); + PyMem_Free(im_unicode_tmp); +#endif + return result; +} +#endif /* FORMAT_COMPLEX */ + +/************************************************************************/ +/*********** built in formatters ****************************************/ +/************************************************************************/ +PyObject * +FORMAT_STRING(PyObject *obj, + STRINGLIB_CHAR *format_spec, + Py_ssize_t format_spec_len) +{ + InternalFormatSpec format; + PyObject *result = NULL; + + /* check for the special case of zero length format spec, make + it equivalent to str(obj) */ + if (format_spec_len == 0) { + result = STRINGLIB_TOSTR(obj); + goto done; + } + + /* parse the format_spec */ + if (!parse_internal_render_format_spec(format_spec, format_spec_len, + &format, 's', '<')) + goto done; + + /* type conversion? */ + switch (format.type) { + case 's': + /* no type conversion needed, already a string. do the formatting */ + result = format_string_internal(obj, &format); + break; + default: + /* unknown */ + unknown_presentation_type(format.type, obj->ob_type->tp_name); + goto done; + } + +done: + return result; +} + +#if defined FORMAT_LONG || defined FORMAT_INT +static PyObject* +format_int_or_long(PyObject* obj, + STRINGLIB_CHAR *format_spec, + Py_ssize_t format_spec_len, + IntOrLongToString tostring) +{ + PyObject *result = NULL; + PyObject *tmp = NULL; + InternalFormatSpec format; + + /* check for the special case of zero length format spec, make + it equivalent to str(obj) */ + if (format_spec_len == 0) { + result = STRINGLIB_TOSTR(obj); + goto done; + } + + /* parse the format_spec */ + if (!parse_internal_render_format_spec(format_spec, + format_spec_len, + &format, 'd', '>')) + goto done; + + /* type conversion? */ + switch (format.type) { + case 'b': + case 'c': + case 'd': + case 'o': + case 'x': + case 'X': + case 'n': + /* no type conversion needed, already an int (or long). do + the formatting */ + result = format_int_or_long_internal(obj, &format, tostring); + break; + + case 'e': + case 'E': + case 'f': + case 'F': + case 'g': + case 'G': + case '%': + /* convert to float */ + tmp = PyNumber_Float(obj); + if (tmp == NULL) + goto done; + result = format_float_internal(tmp, &format); + break; + + default: + /* unknown */ + unknown_presentation_type(format.type, obj->ob_type->tp_name); + goto done; + } + +done: + Py_XDECREF(tmp); + return result; +} +#endif /* FORMAT_LONG || defined FORMAT_INT */ + +#ifdef FORMAT_LONG +/* Need to define long_format as a function that will convert a long + to a string. In 3.0, _PyLong_Format has the correct signature. In + 2.x, we need to fudge a few parameters */ +#if PY_VERSION_HEX >= 0x03000000 +#define long_format _PyLong_Format +#else +static PyObject* +long_format(PyObject* value, int base) +{ + /* Convert to base, don't add trailing 'L', and use the new octal + format. We already know this is a long object */ + assert(PyLong_Check(value)); + /* convert to base, don't add 'L', and use the new octal format */ + return _PyLong_Format(value, base, 0, 1); +} +#endif + +PyObject * +FORMAT_LONG(PyObject *obj, + STRINGLIB_CHAR *format_spec, + Py_ssize_t format_spec_len) +{ + return format_int_or_long(obj, format_spec, format_spec_len, + long_format); +} +#endif /* FORMAT_LONG */ + +#ifdef FORMAT_INT +/* this is only used for 2.x, not 3.0 */ +static PyObject* +int_format(PyObject* value, int base) +{ + /* Convert to base, and use the new octal format. We already + know this is an int object */ + assert(PyInt_Check(value)); + return _PyInt_Format((PyIntObject*)value, base, 1); +} + +PyObject * +FORMAT_INT(PyObject *obj, + STRINGLIB_CHAR *format_spec, + Py_ssize_t format_spec_len) +{ + return format_int_or_long(obj, format_spec, format_spec_len, + int_format); +} +#endif /* FORMAT_INT */ + +#ifdef FORMAT_FLOAT +PyObject * +FORMAT_FLOAT(PyObject *obj, + STRINGLIB_CHAR *format_spec, + Py_ssize_t format_spec_len) +{ + PyObject *result = NULL; + InternalFormatSpec format; + + /* check for the special case of zero length format spec, make + it equivalent to str(obj) */ + if (format_spec_len == 0) { + result = STRINGLIB_TOSTR(obj); + goto done; + } + + /* parse the format_spec */ + if (!parse_internal_render_format_spec(format_spec, + format_spec_len, + &format, '\0', '>')) + goto done; + + /* type conversion? */ + switch (format.type) { + case '\0': /* No format code: like 'g', but with at least one decimal. */ + case 'e': + case 'E': + case 'f': + case 'F': + case 'g': + case 'G': + case 'n': + case '%': + /* no conversion, already a float. do the formatting */ + result = format_float_internal(obj, &format); + break; + + default: + /* unknown */ + unknown_presentation_type(format.type, obj->ob_type->tp_name); + goto done; + } + +done: + return result; +} +#endif /* FORMAT_FLOAT */ + +#ifdef FORMAT_COMPLEX +PyObject * +FORMAT_COMPLEX(PyObject *obj, + STRINGLIB_CHAR *format_spec, + Py_ssize_t format_spec_len) +{ + PyObject *result = NULL; + InternalFormatSpec format; + + /* check for the special case of zero length format spec, make + it equivalent to str(obj) */ + if (format_spec_len == 0) { + result = STRINGLIB_TOSTR(obj); + goto done; + } + + /* parse the format_spec */ + if (!parse_internal_render_format_spec(format_spec, + format_spec_len, + &format, '\0', '>')) + goto done; + + /* type conversion? */ + switch (format.type) { + case '\0': /* No format code: like 'g', but with at least one decimal. */ + case 'e': + case 'E': + case 'f': + case 'F': + case 'g': + case 'G': + case 'n': + /* no conversion, already a complex. do the formatting */ + result = format_complex_internal(obj, &format); + break; + + default: + /* unknown */ + unknown_presentation_type(format.type, obj->ob_type->tp_name); + goto done; + } + +done: + return result; +} +#endif /* FORMAT_COMPLEX */ diff --git a/Objects/stringlib/join.h b/Objects/stringlib/join.h deleted file mode 100644 index 6f314e1..0000000 --- a/Objects/stringlib/join.h +++ /dev/null @@ -1,140 +0,0 @@ -/* stringlib: bytes joining implementation */ - -#if STRINGLIB_IS_UNICODE -#error join.h only compatible with byte-wise strings -#endif - -Py_LOCAL_INLINE(PyObject *) -STRINGLIB(bytes_join)(PyObject *sep, PyObject *iterable) -{ - char *sepstr = STRINGLIB_STR(sep); - const Py_ssize_t seplen = STRINGLIB_LEN(sep); - PyObject *res = NULL; - char *p; - Py_ssize_t seqlen = 0; - Py_ssize_t sz = 0; - Py_ssize_t i, nbufs; - PyObject *seq, *item; - Py_buffer *buffers = NULL; -#define NB_STATIC_BUFFERS 10 - Py_buffer static_buffers[NB_STATIC_BUFFERS]; - - seq = PySequence_Fast(iterable, "can only join an iterable"); - if (seq == NULL) { - return NULL; - } - - seqlen = PySequence_Fast_GET_SIZE(seq); - if (seqlen == 0) { - Py_DECREF(seq); - return STRINGLIB_NEW(NULL, 0); - } -#ifndef STRINGLIB_MUTABLE - if (seqlen == 1) { - item = PySequence_Fast_GET_ITEM(seq, 0); - if (STRINGLIB_CHECK_EXACT(item)) { - Py_INCREF(item); - Py_DECREF(seq); - return item; - } - } -#endif - if (seqlen > NB_STATIC_BUFFERS) { - buffers = PyMem_NEW(Py_buffer, seqlen); - if (buffers == NULL) { - Py_DECREF(seq); - PyErr_NoMemory(); - return NULL; - } - } - else { - buffers = static_buffers; - } - - /* Here is the general case. Do a pre-pass to figure out the total - * amount of space we'll need (sz), and see whether all arguments are - * bytes-like. - */ - for (i = 0, nbufs = 0; i < seqlen; i++) { - Py_ssize_t itemlen; - item = PySequence_Fast_GET_ITEM(seq, i); - if (PyBytes_CheckExact(item)) { - /* Fast path. */ - Py_INCREF(item); - buffers[i].obj = item; - buffers[i].buf = PyBytes_AS_STRING(item); - buffers[i].len = PyBytes_GET_SIZE(item); - } - else if (PyObject_GetBuffer(item, &buffers[i], PyBUF_SIMPLE) != 0) { - PyErr_Format(PyExc_TypeError, - "sequence item %zd: expected a bytes-like object, " - "%.80s found", - i, Py_TYPE(item)->tp_name); - goto error; - } - nbufs = i + 1; /* for error cleanup */ - itemlen = buffers[i].len; - if (itemlen > PY_SSIZE_T_MAX - sz) { - PyErr_SetString(PyExc_OverflowError, - "join() result is too long"); - goto error; - } - sz += itemlen; - if (i != 0) { - if (seplen > PY_SSIZE_T_MAX - sz) { - PyErr_SetString(PyExc_OverflowError, - "join() result is too long"); - goto error; - } - sz += seplen; - } - if (seqlen != PySequence_Fast_GET_SIZE(seq)) { - PyErr_SetString(PyExc_RuntimeError, - "sequence changed size during iteration"); - goto error; - } - } - - /* Allocate result space. */ - res = STRINGLIB_NEW(NULL, sz); - if (res == NULL) - goto error; - - /* Catenate everything. */ - p = STRINGLIB_STR(res); - if (!seplen) { - /* fast path */ - for (i = 0; i < nbufs; i++) { - Py_ssize_t n = buffers[i].len; - char *q = buffers[i].buf; - memcpy(p, q, n); - p += n; - } - goto done; - } - for (i = 0; i < nbufs; i++) { - Py_ssize_t n; - char *q; - if (i) { - memcpy(p, sepstr, seplen); - p += seplen; - } - n = buffers[i].len; - q = buffers[i].buf; - memcpy(p, q, n); - p += n; - } - goto done; - -error: - res = NULL; -done: - Py_DECREF(seq); - for (i = 0; i < nbufs; i++) - PyBuffer_Release(&buffers[i]); - if (buffers != static_buffers) - PyMem_FREE(buffers); - return res; -} - -#undef NB_STATIC_BUFFERS diff --git a/Objects/stringlib/localeutil.h b/Objects/stringlib/localeutil.h index bd16e0a..f548133 100644 --- a/Objects/stringlib/localeutil.h +++ b/Objects/stringlib/localeutil.h @@ -1,4 +1,12 @@ -/* _PyUnicode_InsertThousandsGrouping() helper functions */ +/* stringlib: locale related helpers implementation */ + +#ifndef STRINGLIB_LOCALEUTIL_H +#define STRINGLIB_LOCALEUTIL_H + +#include <locale.h> + +#define MAX(x, y) ((x) < (y) ? (y) : (x)) +#define MIN(x, y) ((x) < (y) ? (x) : (y)) typedef struct { const char *grouping; @@ -6,19 +14,17 @@ typedef struct { Py_ssize_t i; /* Where we're currently pointing in grouping. */ } GroupGenerator; - static void -GroupGenerator_init(GroupGenerator *self, const char *grouping) +_GroupGenerator_init(GroupGenerator *self, const char *grouping) { self->grouping = grouping; self->i = 0; self->previous = 0; } - /* Returns the next grouping, or 0 to signify end. */ static Py_ssize_t -GroupGenerator_next(GroupGenerator *self) +_GroupGenerator_next(GroupGenerator *self) { /* Note that we don't really do much error checking here. If a grouping string contains just CHAR_MAX, for example, then just @@ -39,44 +45,168 @@ GroupGenerator_next(GroupGenerator *self) } } - /* Fill in some digits, leading zeros, and thousands separator. All are optional, depending on when we're called. */ static void -InsertThousandsGrouping_fill(_PyUnicodeWriter *writer, Py_ssize_t *buffer_pos, - PyObject *digits, Py_ssize_t *digits_pos, - Py_ssize_t n_chars, Py_ssize_t n_zeros, - PyObject *thousands_sep, Py_ssize_t thousands_sep_len, - Py_UCS4 *maxchar) +fill(STRINGLIB_CHAR **digits_end, STRINGLIB_CHAR **buffer_end, + Py_ssize_t n_chars, Py_ssize_t n_zeros, const char* thousands_sep, + Py_ssize_t thousands_sep_len) { - if (!writer) { - /* if maxchar > 127, maxchar is already set */ - if (*maxchar == 127 && thousands_sep) { - Py_UCS4 maxchar2 = PyUnicode_MAX_CHAR_VALUE(thousands_sep); - *maxchar = Py_MAX(*maxchar, maxchar2); - } - return; - } +#if STRINGLIB_IS_UNICODE + Py_ssize_t i; +#endif if (thousands_sep) { - *buffer_pos -= thousands_sep_len; + *buffer_end -= thousands_sep_len; /* Copy the thousands_sep chars into the buffer. */ - _PyUnicode_FastCopyCharacters(writer->buffer, *buffer_pos, - thousands_sep, 0, - thousands_sep_len); +#if STRINGLIB_IS_UNICODE + /* Convert from the char's of the thousands_sep from + the locale into unicode. */ + for (i = 0; i < thousands_sep_len; ++i) + (*buffer_end)[i] = thousands_sep[i]; +#else + /* No conversion, just memcpy the thousands_sep. */ + memcpy(*buffer_end, thousands_sep, thousands_sep_len); +#endif } - *buffer_pos -= n_chars; - *digits_pos -= n_chars; - _PyUnicode_FastCopyCharacters(writer->buffer, *buffer_pos, - digits, *digits_pos, - n_chars); - - if (n_zeros) { - *buffer_pos -= n_zeros; - enum PyUnicode_Kind kind = PyUnicode_KIND(writer->buffer); - void *data = PyUnicode_DATA(writer->buffer); - unicode_fill(kind, data, '0', *buffer_pos, n_zeros); + *buffer_end -= n_chars; + *digits_end -= n_chars; + memcpy(*buffer_end, *digits_end, n_chars * sizeof(STRINGLIB_CHAR)); + + *buffer_end -= n_zeros; + STRINGLIB_FILL(*buffer_end, '0', n_zeros); +} + +/** + * _Py_InsertThousandsGrouping: + * @buffer: A pointer to the start of a string. + * @n_buffer: Number of characters in @buffer. + * @digits: A pointer to the digits we're reading from. If count + * is non-NULL, this is unused. + * @n_digits: The number of digits in the string, in which we want + * to put the grouping chars. + * @min_width: The minimum width of the digits in the output string. + * Output will be zero-padded on the left to fill. + * @grouping: see definition in localeconv(). + * @thousands_sep: see definition in localeconv(). + * + * There are 2 modes: counting and filling. If @buffer is NULL, + * we are in counting mode, else filling mode. + * If counting, the required buffer size is returned. + * If filling, we know the buffer will be large enough, so we don't + * need to pass in the buffer size. + * Inserts thousand grouping characters (as defined by grouping and + * thousands_sep) into the string between buffer and buffer+n_digits. + * + * Return value: 0 on error, else 1. Note that no error can occur if + * count is non-NULL. + * + * This name won't be used, the includer of this file should define + * it to be the actual function name, based on unicode or string. + * + * As closely as possible, this code mimics the logic in decimal.py's + _insert_thousands_sep(). + **/ +Py_ssize_t +_Py_InsertThousandsGrouping(STRINGLIB_CHAR *buffer, + Py_ssize_t n_buffer, + STRINGLIB_CHAR *digits, + Py_ssize_t n_digits, + Py_ssize_t min_width, + const char *grouping, + const char *thousands_sep) +{ + Py_ssize_t count = 0; + Py_ssize_t n_zeros; + int loop_broken = 0; + int use_separator = 0; /* First time through, don't append the + separator. They only go between + groups. */ + STRINGLIB_CHAR *buffer_end = NULL; + STRINGLIB_CHAR *digits_end = NULL; + Py_ssize_t l; + Py_ssize_t n_chars; + Py_ssize_t thousands_sep_len = strlen(thousands_sep); + Py_ssize_t remaining = n_digits; /* Number of chars remaining to + be looked at */ + /* A generator that returns all of the grouping widths, until it + returns 0. */ + GroupGenerator groupgen; + _GroupGenerator_init(&groupgen, grouping); + + if (buffer) { + buffer_end = buffer + n_buffer; + digits_end = digits + n_digits; } + + while ((l = _GroupGenerator_next(&groupgen)) > 0) { + l = MIN(l, MAX(MAX(remaining, min_width), 1)); + n_zeros = MAX(0, l - remaining); + n_chars = MAX(0, MIN(remaining, l)); + + /* Use n_zero zero's and n_chars chars */ + + /* Count only, don't do anything. */ + count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars; + + if (buffer) { + /* Copy into the output buffer. */ + fill(&digits_end, &buffer_end, n_chars, n_zeros, + use_separator ? thousands_sep : NULL, thousands_sep_len); + } + + /* Use a separator next time. */ + use_separator = 1; + + remaining -= n_chars; + min_width -= l; + + if (remaining <= 0 && min_width <= 0) { + loop_broken = 1; + break; + } + min_width -= thousands_sep_len; + } + if (!loop_broken) { + /* We left the loop without using a break statement. */ + + l = MAX(MAX(remaining, min_width), 1); + n_zeros = MAX(0, l - remaining); + n_chars = MAX(0, MIN(remaining, l)); + + /* Use n_zero zero's and n_chars chars */ + count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars; + if (buffer) { + /* Copy into the output buffer. */ + fill(&digits_end, &buffer_end, n_chars, n_zeros, + use_separator ? thousands_sep : NULL, thousands_sep_len); + } + } + return count; +} + +/** + * _Py_InsertThousandsGroupingLocale: + * @buffer: A pointer to the start of a string. + * @n_digits: The number of digits in the string, in which we want + * to put the grouping chars. + * + * Reads thee current locale and calls _Py_InsertThousandsGrouping(). + **/ +Py_ssize_t +_Py_InsertThousandsGroupingLocale(STRINGLIB_CHAR *buffer, + Py_ssize_t n_buffer, + STRINGLIB_CHAR *digits, + Py_ssize_t n_digits, + Py_ssize_t min_width) +{ + struct lconv *locale_data = localeconv(); + const char *grouping = locale_data->grouping; + const char *thousands_sep = locale_data->thousands_sep; + + return _Py_InsertThousandsGrouping(buffer, n_buffer, digits, n_digits, + min_width, grouping, thousands_sep); } +#endif /* STRINGLIB_LOCALEUTIL_H */ diff --git a/Objects/stringlib/partition.h b/Objects/stringlib/partition.h index ed32a6f..0170bdd 100644 --- a/Objects/stringlib/partition.h +++ b/Objects/stringlib/partition.h @@ -1,11 +1,14 @@ /* stringlib: partition implementation */ +#ifndef STRINGLIB_PARTITION_H +#define STRINGLIB_PARTITION_H + #ifndef STRINGLIB_FASTSEARCH_H #error must include "stringlib/fastsearch.h" before including this module #endif Py_LOCAL_INLINE(PyObject*) -STRINGLIB(partition)(PyObject* str_obj, +stringlib_partition(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, PyObject* sep_obj, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len) @@ -22,18 +25,13 @@ STRINGLIB(partition)(PyObject* str_obj, if (!out) return NULL; - pos = FASTSEARCH(str, str_len, sep, sep_len, -1, FAST_SEARCH); + pos = fastsearch(str, str_len, sep, sep_len, -1, FAST_SEARCH); if (pos < 0) { #if STRINGLIB_MUTABLE PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, str_len)); PyTuple_SET_ITEM(out, 1, STRINGLIB_NEW(NULL, 0)); PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(NULL, 0)); - - if (PyErr_Occurred()) { - Py_DECREF(out); - return NULL; - } #else Py_INCREF(str_obj); PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj); @@ -60,7 +58,7 @@ STRINGLIB(partition)(PyObject* str_obj, } Py_LOCAL_INLINE(PyObject*) -STRINGLIB(rpartition)(PyObject* str_obj, +stringlib_rpartition(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, PyObject* sep_obj, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len) @@ -77,18 +75,13 @@ STRINGLIB(rpartition)(PyObject* str_obj, if (!out) return NULL; - pos = FASTSEARCH(str, str_len, sep, sep_len, -1, FAST_RSEARCH); + pos = fastsearch(str, str_len, sep, sep_len, -1, FAST_RSEARCH); if (pos < 0) { #if STRINGLIB_MUTABLE PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(NULL, 0)); PyTuple_SET_ITEM(out, 1, STRINGLIB_NEW(NULL, 0)); PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str, str_len)); - - if (PyErr_Occurred()) { - Py_DECREF(out); - return NULL; - } #else Py_INCREF(STRINGLIB_EMPTY); PyTuple_SET_ITEM(out, 0, (PyObject*) STRINGLIB_EMPTY); @@ -114,3 +107,4 @@ STRINGLIB(rpartition)(PyObject* str_obj, return out; } +#endif diff --git a/Objects/stringlib/replace.h b/Objects/stringlib/replace.h deleted file mode 100644 index ef318ed..0000000 --- a/Objects/stringlib/replace.h +++ /dev/null @@ -1,53 +0,0 @@ -/* stringlib: replace implementation */ - -#ifndef STRINGLIB_FASTSEARCH_H -#error must include "stringlib/fastsearch.h" before including this module -#endif - -Py_LOCAL_INLINE(void) -STRINGLIB(replace_1char_inplace)(STRINGLIB_CHAR* s, STRINGLIB_CHAR* end, - Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount) -{ - *s = u2; - while (--maxcount && ++s != end) { - /* Find the next character to be replaced. - - If it occurs often, it is faster to scan for it using an inline - loop. If it occurs seldom, it is faster to scan for it using a - function call; the overhead of the function call is amortized - across the many characters that call covers. We start with an - inline loop and use a heuristic to determine whether to fall back - to a function call. */ - if (*s != u1) { - int attempts = 10; - /* search u1 in a dummy loop */ - while (1) { - if (++s == end) - return; - if (*s == u1) - break; - if (!--attempts) { - /* if u1 was not found for attempts iterations, - use FASTSEARCH() or memchr() */ -#if STRINGLIB_SIZEOF_CHAR == 1 - s++; - s = memchr(s, u1, end - s); - if (s == NULL) - return; -#else - Py_ssize_t i; - STRINGLIB_CHAR ch1 = (STRINGLIB_CHAR) u1; - s++; - i = FASTSEARCH(s, end - s, &ch1, 1, 0, FAST_SEARCH); - if (i < 0) - return; - s += i; -#endif - /* restart the dummy loop */ - break; - } - } - } - *s = u2; - } -} diff --git a/Objects/stringlib/split.h b/Objects/stringlib/split.h index 31f77a7..60e7767 100644 --- a/Objects/stringlib/split.h +++ b/Objects/stringlib/split.h @@ -1,5 +1,8 @@ /* stringlib: split implementation */ +#ifndef STRINGLIB_SPLIT_H +#define STRINGLIB_SPLIT_H + #ifndef STRINGLIB_FASTSEARCH_H #error must include "stringlib/fastsearch.h" before including this module #endif @@ -51,7 +54,7 @@ #define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count Py_LOCAL_INLINE(PyObject *) -STRINGLIB(split_whitespace)(PyObject* str_obj, +stringlib_split_whitespace(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, Py_ssize_t maxcount) { @@ -99,7 +102,7 @@ STRINGLIB(split_whitespace)(PyObject* str_obj, } Py_LOCAL_INLINE(PyObject *) -STRINGLIB(split_char)(PyObject* str_obj, +stringlib_split_char(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR ch, Py_ssize_t maxcount) @@ -142,7 +145,7 @@ STRINGLIB(split_char)(PyObject* str_obj, } Py_LOCAL_INLINE(PyObject *) -STRINGLIB(split)(PyObject* str_obj, +stringlib_split(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, Py_ssize_t maxcount) @@ -155,7 +158,7 @@ STRINGLIB(split)(PyObject* str_obj, return NULL; } else if (sep_len == 1) - return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount); + return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount); list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) @@ -163,7 +166,7 @@ STRINGLIB(split)(PyObject* str_obj, i = j = 0; while (maxcount-- > 0) { - pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); + pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); if (pos < 0) break; j = i + pos; @@ -190,7 +193,7 @@ STRINGLIB(split)(PyObject* str_obj, } Py_LOCAL_INLINE(PyObject *) -STRINGLIB(rsplit_whitespace)(PyObject* str_obj, +stringlib_rsplit_whitespace(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, Py_ssize_t maxcount) { @@ -240,7 +243,7 @@ STRINGLIB(rsplit_whitespace)(PyObject* str_obj, } Py_LOCAL_INLINE(PyObject *) -STRINGLIB(rsplit_char)(PyObject* str_obj, +stringlib_rsplit_char(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR ch, Py_ssize_t maxcount) @@ -284,7 +287,7 @@ STRINGLIB(rsplit_char)(PyObject* str_obj, } Py_LOCAL_INLINE(PyObject *) -STRINGLIB(rsplit)(PyObject* str_obj, +stringlib_rsplit(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, Py_ssize_t maxcount) @@ -297,7 +300,7 @@ STRINGLIB(rsplit)(PyObject* str_obj, return NULL; } else if (sep_len == 1) - return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount); + return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount); list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) @@ -305,7 +308,7 @@ STRINGLIB(rsplit)(PyObject* str_obj, j = str_len; while (maxcount-- > 0) { - pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH); + pos = fastsearch(str, j, sep, sep_len, -1, FAST_RSEARCH); if (pos < 0) break; SPLIT_ADD(str, pos + sep_len, j); @@ -333,7 +336,7 @@ STRINGLIB(rsplit)(PyObject* str_obj, } Py_LOCAL_INLINE(PyObject *) -STRINGLIB(splitlines)(PyObject* str_obj, +stringlib_splitlines(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, int keepends) { @@ -345,8 +348,8 @@ STRINGLIB(splitlines)(PyObject* str_obj, and the appends only done when the prealloc buffer is full. That's too much work for little gain.*/ - Py_ssize_t i; - Py_ssize_t j; + register Py_ssize_t i; + register Py_ssize_t j; PyObject *list = PyList_New(0); PyObject *sub; @@ -388,3 +391,4 @@ STRINGLIB(splitlines)(PyObject* str_obj, return NULL; } +#endif diff --git a/Objects/stringlib/unicode_format.h b/Objects/stringlib/string_format.h index b526ad2..2bd1839 100644 --- a/Objects/stringlib/unicode_format.h +++ b/Objects/stringlib/string_format.h @@ -1,7 +1,22 @@ /* - unicode_format.h -- implementation of str.format(). + string_format.h -- implementation of string.format(). + + It uses the Objects/stringlib conventions, so that it can be + compiled for both unicode and string objects. */ + +/* Defines for Python 2.6 compatibility */ +#if PY_VERSION_HEX < 0x03000000 +#define PyLong_FromSsize_t _PyLong_FromSsize_t +#endif + +/* Defines for more efficiently reallocating the string buffer */ +#define INITIAL_SIZE_INCREMENT 100 +#define SIZE_MULTIPLIER 2 +#define MAX_SIZE_INCREMENT 3200 + + /************************************************************************/ /*********** Global data structures and forward declarations *********/ /************************************************************************/ @@ -11,8 +26,8 @@ unicode pointers. */ typedef struct { - PyObject *str; /* borrowed reference */ - Py_ssize_t start, end; + STRINGLIB_CHAR *ptr; + STRINGLIB_CHAR *end; } SubString; @@ -49,30 +64,34 @@ AutoNumber_Init(AutoNumber *auto_number) /* fill in a SubString from a pointer and length */ Py_LOCAL_INLINE(void) -SubString_init(SubString *str, PyObject *s, Py_ssize_t start, Py_ssize_t end) +SubString_init(SubString *str, STRINGLIB_CHAR *p, Py_ssize_t len) { - str->str = s; - str->start = start; - str->end = end; + str->ptr = p; + if (p == NULL) + str->end = NULL; + else + str->end = str->ptr + len; } -/* return a new string. if str->str is NULL, return None */ +/* return a new string. if str->ptr is NULL, return None */ Py_LOCAL_INLINE(PyObject *) SubString_new_object(SubString *str) { - if (str->str == NULL) - Py_RETURN_NONE; - return PyUnicode_Substring(str->str, str->start, str->end); + if (str->ptr == NULL) { + Py_INCREF(Py_None); + return Py_None; + } + return STRINGLIB_NEW(str->ptr, str->end - str->ptr); } -/* return a new string. if str->str is NULL, return a new empty string */ +/* return a new string. if str->ptr is NULL, return None */ Py_LOCAL_INLINE(PyObject *) SubString_new_object_or_empty(SubString *str) { - if (str->str == NULL) { - return PyUnicode_New(0, 0); + if (str->ptr == NULL) { + return STRINGLIB_NEW(NULL, 0); } - return SubString_new_object(str); + return STRINGLIB_NEW(str->ptr, str->end - str->ptr); } /* Return 1 if an error has been detected switching between automatic @@ -102,6 +121,74 @@ autonumber_state_error(AutoNumberState state, int field_name_is_empty) /************************************************************************/ +/*********** Output string management functions ****************/ +/************************************************************************/ + +typedef struct { + STRINGLIB_CHAR *ptr; + STRINGLIB_CHAR *end; + PyObject *obj; + Py_ssize_t size_increment; +} OutputString; + +/* initialize an OutputString object, reserving size characters */ +static int +output_initialize(OutputString *output, Py_ssize_t size) +{ + output->obj = STRINGLIB_NEW(NULL, size); + if (output->obj == NULL) + return 0; + + output->ptr = STRINGLIB_STR(output->obj); + output->end = STRINGLIB_LEN(output->obj) + output->ptr; + output->size_increment = INITIAL_SIZE_INCREMENT; + + return 1; +} + +/* + output_extend reallocates the output string buffer. + It returns a status: 0 for a failed reallocation, + 1 for success. +*/ + +static int +output_extend(OutputString *output, Py_ssize_t count) +{ + STRINGLIB_CHAR *startptr = STRINGLIB_STR(output->obj); + Py_ssize_t curlen = output->ptr - startptr; + Py_ssize_t maxlen = curlen + count + output->size_increment; + + if (STRINGLIB_RESIZE(&output->obj, maxlen) < 0) + return 0; + startptr = STRINGLIB_STR(output->obj); + output->ptr = startptr + curlen; + output->end = startptr + maxlen; + if (output->size_increment < MAX_SIZE_INCREMENT) + output->size_increment *= SIZE_MULTIPLIER; + return 1; +} + +/* + output_data dumps characters into our output string + buffer. + + In some cases, it has to reallocate the string. + + It returns a status: 0 for a failed reallocation, + 1 for success. +*/ +static int +output_data(OutputString *output, const STRINGLIB_CHAR *s, Py_ssize_t count) +{ + if ((count > output->end - output->ptr) && !output_extend(output, count)) + return 0; + memcpy(output->ptr, s, count * sizeof(STRINGLIB_CHAR)); + output->ptr += count; + return 1; +} + +/************************************************************************/ /*********** Format string parsing -- integers and identifiers *********/ /************************************************************************/ @@ -110,14 +197,14 @@ get_integer(const SubString *str) { Py_ssize_t accumulator = 0; Py_ssize_t digitval; - Py_ssize_t i; + STRINGLIB_CHAR *p; /* empty string is an error */ - if (str->start >= str->end) + if (str->ptr >= str->end) return -1; - for (i = str->start; i < str->end; i++) { - digitval = Py_UNICODE_TODECIMAL(PyUnicode_READ_CHAR(str->str, i)); + for (p = str->ptr; p < str->end; p++) { + digitval = STRINGLIB_TODECIMAL(*p); if (digitval < 0) return -1; /* @@ -192,36 +279,34 @@ typedef struct { lifetime of the iterator. can be empty */ SubString str; - /* index to where we are inside field_name */ - Py_ssize_t index; + /* pointer to where we are inside field_name */ + STRINGLIB_CHAR *ptr; } FieldNameIterator; static int -FieldNameIterator_init(FieldNameIterator *self, PyObject *s, - Py_ssize_t start, Py_ssize_t end) +FieldNameIterator_init(FieldNameIterator *self, STRINGLIB_CHAR *ptr, + Py_ssize_t len) { - SubString_init(&self->str, s, start, end); - self->index = start; + SubString_init(&self->str, ptr, len); + self->ptr = self->str.ptr; return 1; } static int _FieldNameIterator_attr(FieldNameIterator *self, SubString *name) { - Py_UCS4 c; + STRINGLIB_CHAR c; - name->str = self->str.str; - name->start = self->index; + name->ptr = self->ptr; /* return everything until '.' or '[' */ - while (self->index < self->str.end) { - c = PyUnicode_READ_CHAR(self->str.str, self->index++); - switch (c) { + while (self->ptr < self->str.end) { + switch (c = *self->ptr++) { case '[': case '.': /* backup so that we this character will be seen next time */ - self->index--; + self->ptr--; break; default: continue; @@ -229,7 +314,7 @@ _FieldNameIterator_attr(FieldNameIterator *self, SubString *name) break; } /* end of string is okay */ - name->end = self->index; + name->end = self->ptr; return 1; } @@ -237,15 +322,13 @@ static int _FieldNameIterator_item(FieldNameIterator *self, SubString *name) { int bracket_seen = 0; - Py_UCS4 c; + STRINGLIB_CHAR c; - name->str = self->str.str; - name->start = self->index; + name->ptr = self->ptr; /* return everything until ']' */ - while (self->index < self->str.end) { - c = PyUnicode_READ_CHAR(self->str.str, self->index++); - switch (c) { + while (self->ptr < self->str.end) { + switch (c = *self->ptr++) { case ']': bracket_seen = 1; break; @@ -262,7 +345,7 @@ _FieldNameIterator_item(FieldNameIterator *self, SubString *name) /* end of string is okay */ /* don't include the ']' */ - name->end = self->index-1; + name->end = self->ptr-1; return 1; } @@ -272,10 +355,10 @@ FieldNameIterator_next(FieldNameIterator *self, int *is_attribute, Py_ssize_t *name_idx, SubString *name) { /* check at end of input */ - if (self->index >= self->str.end) + if (self->ptr >= self->str.end) return 1; - switch (PyUnicode_READ_CHAR(self->str.str, self->index++)) { + switch (*self->ptr++) { case '.': *is_attribute = 1; if (_FieldNameIterator_attr(self, name) == 0) @@ -298,7 +381,7 @@ FieldNameIterator_next(FieldNameIterator *self, int *is_attribute, } /* empty string is an error */ - if (name->start == name->end) { + if (name->ptr == name->end) { PyErr_SetString(PyExc_ValueError, "Empty attribute in format string"); return 0; } @@ -314,23 +397,24 @@ FieldNameIterator_next(FieldNameIterator *self, int *is_attribute, 'rest' is an iterator to return the rest */ static int -field_name_split(PyObject *str, Py_ssize_t start, Py_ssize_t end, SubString *first, +field_name_split(STRINGLIB_CHAR *ptr, Py_ssize_t len, SubString *first, Py_ssize_t *first_idx, FieldNameIterator *rest, AutoNumber *auto_number) { - Py_UCS4 c; - Py_ssize_t i = start; + STRINGLIB_CHAR c; + STRINGLIB_CHAR *p = ptr; + STRINGLIB_CHAR *end = ptr + len; int field_name_is_empty; int using_numeric_index; /* find the part up until the first '.' or '[' */ - while (i < end) { - switch (c = PyUnicode_READ_CHAR(str, i++)) { + while (p < end) { + switch (c = *p++) { case '[': case '.': /* backup so that we this character is available to the "rest" iterator */ - i--; + p--; break; default: continue; @@ -339,15 +423,15 @@ field_name_split(PyObject *str, Py_ssize_t start, Py_ssize_t end, SubString *fir } /* set up the return values */ - SubString_init(first, str, start, i); - FieldNameIterator_init(rest, str, i, end); + SubString_init(first, ptr, p - ptr); + FieldNameIterator_init(rest, p, end - p); /* see if "first" is an integer, in which case it's used as an index */ *first_idx = get_integer(first); if (*first_idx == -1 && PyErr_Occurred()) return 0; - field_name_is_empty = first->start >= first->end; + field_name_is_empty = first->ptr >= first->end; /* If the field name is omitted or if we have a numeric index specified, then we're doing numeric indexing into args. */ @@ -402,7 +486,7 @@ get_field_object(SubString *input, PyObject *args, PyObject *kwargs, Py_ssize_t index; FieldNameIterator rest; - if (!field_name_split(input->str, input->start, input->end, &first, + if (!field_name_split(input->ptr, input->end - input->ptr, &first, &index, &rest, auto_number)) { goto error; } @@ -410,43 +494,21 @@ get_field_object(SubString *input, PyObject *args, PyObject *kwargs, if (index == -1) { /* look up in kwargs */ PyObject *key = SubString_new_object(&first); - if (key == NULL) { + if (key == NULL) goto error; - } - if (kwargs == NULL) { + if ((kwargs == NULL) || (obj = PyDict_GetItem(kwargs, key)) == NULL) { PyErr_SetObject(PyExc_KeyError, key); Py_DECREF(key); goto error; } - /* Use PyObject_GetItem instead of PyDict_GetItem because this - code is no longer just used with kwargs. It might be passed - a non-dict when called through format_map. */ - obj = PyObject_GetItem(kwargs, key); Py_DECREF(key); - if (obj == NULL) { - goto error; - } + Py_INCREF(obj); } else { - /* If args is NULL, we have a format string with a positional field - with only kwargs to retrieve it from. This can only happen when - used with format_map(), where positional arguments are not - allowed. */ - if (args == NULL) { - PyErr_SetString(PyExc_ValueError, "Format string contains " - "positional fields"); - goto error; - } - /* look up in args */ obj = PySequence_GetItem(args, index); - if (obj == NULL) { - PyErr_Format(PyExc_IndexError, - "Replacement index %zd out of range for positional " - "args tuple", - index); - goto error; - } + if (obj == NULL) + goto error; } /* iterate over the rest of the field_name */ @@ -495,41 +557,48 @@ error: appends to the output. */ static int -render_field(PyObject *fieldobj, SubString *format_spec, _PyUnicodeWriter *writer) +render_field(PyObject *fieldobj, SubString *format_spec, OutputString *output) { int ok = 0; PyObject *result = NULL; PyObject *format_spec_object = NULL; - int (*formatter) (_PyUnicodeWriter*, PyObject *, PyObject *, Py_ssize_t, Py_ssize_t) = NULL; - int err; + PyObject *(*formatter)(PyObject *, STRINGLIB_CHAR *, Py_ssize_t) = NULL; + STRINGLIB_CHAR* format_spec_start = format_spec->ptr ? + format_spec->ptr : NULL; + Py_ssize_t format_spec_len = format_spec->ptr ? + format_spec->end - format_spec->ptr : 0; /* If we know the type exactly, skip the lookup of __format__ and just call the formatter directly. */ +#if STRINGLIB_IS_UNICODE if (PyUnicode_CheckExact(fieldobj)) - formatter = _PyUnicode_FormatAdvancedWriter; + formatter = _PyUnicode_FormatAdvanced; + /* Unfortunately, there's a problem with checking for int, long, + and float here. If we're being included as unicode, their + formatters expect string format_spec args. For now, just skip + this optimization for unicode. This could be fixed, but it's a + hassle. */ +#else + if (PyString_CheckExact(fieldobj)) + formatter = _PyBytes_FormatAdvanced; + else if (PyInt_CheckExact(fieldobj)) + formatter =_PyInt_FormatAdvanced; else if (PyLong_CheckExact(fieldobj)) - formatter = _PyLong_FormatAdvancedWriter; + formatter =_PyLong_FormatAdvanced; else if (PyFloat_CheckExact(fieldobj)) - formatter = _PyFloat_FormatAdvancedWriter; - else if (PyComplex_CheckExact(fieldobj)) - formatter = _PyComplex_FormatAdvancedWriter; + formatter = _PyFloat_FormatAdvanced; +#endif if (formatter) { /* we know exactly which formatter will be called when __format__ is looked up, so call it directly, instead. */ - err = formatter(writer, fieldobj, format_spec->str, - format_spec->start, format_spec->end); - return (err == 0); + result = formatter(fieldobj, format_spec_start, format_spec_len); } else { /* We need to create an object out of the pointers we have, because __format__ takes a string/unicode object for format_spec. */ - if (format_spec->str) - format_spec_object = PyUnicode_Substring(format_spec->str, - format_spec->start, - format_spec->end); - else - format_spec_object = PyUnicode_New(0, 0); + format_spec_object = STRINGLIB_NEW(format_spec_start, + format_spec_len); if (format_spec_object == NULL) goto done; @@ -538,10 +607,24 @@ render_field(PyObject *fieldobj, SubString *format_spec, _PyUnicodeWriter *write if (result == NULL) goto done; - if (_PyUnicodeWriter_WriteStr(writer, result) == -1) - goto done; - ok = 1; +#if PY_VERSION_HEX >= 0x03000000 + assert(PyUnicode_Check(result)); +#else + assert(PyString_Check(result) || PyUnicode_Check(result)); + + /* Convert result to our type. We could be str, and result could + be unicode */ + { + PyObject *tmp = STRINGLIB_TOSTR(result); + if (tmp == NULL) + goto done; + Py_DECREF(result); + result = tmp; + } +#endif + ok = output_data(output, + STRINGLIB_STR(result), STRINGLIB_LEN(result)); done: Py_XDECREF(format_spec_object); Py_XDECREF(result); @@ -550,33 +633,23 @@ done: static int parse_field(SubString *str, SubString *field_name, SubString *format_spec, - int *format_spec_needs_expanding, Py_UCS4 *conversion) + STRINGLIB_CHAR *conversion) { /* Note this function works if the field name is zero length, which is good. Zero length field names are handled later, in field_name_split. */ - Py_UCS4 c = 0; + STRINGLIB_CHAR c = 0; /* initialize these, as they may be empty */ *conversion = '\0'; - SubString_init(format_spec, NULL, 0, 0); + SubString_init(format_spec, NULL, 0); /* Search for the field name. it's terminated by the end of the string, or a ':' or '!' */ - field_name->str = str->str; - field_name->start = str->start; - while (str->start < str->end) { - switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) { - case '{': - PyErr_SetString(PyExc_ValueError, "unexpected '{' in field name"); - return 0; - case '[': - for (; str->start < str->end; str->start++) - if (PyUnicode_READ_CHAR(str->str, str->start) == ']') - break; - continue; - case '}': + field_name->ptr = str->ptr; + while (str->ptr < str->end) { + switch (c = *(str->ptr++)) { case ':': case '!': break; @@ -586,62 +659,40 @@ parse_field(SubString *str, SubString *field_name, SubString *format_spec, break; } - field_name->end = str->start - 1; if (c == '!' || c == ':') { - Py_ssize_t count; /* we have a format specifier and/or a conversion */ /* don't include the last character */ + field_name->end = str->ptr-1; + + /* the format specifier is the rest of the string */ + format_spec->ptr = str->ptr; + format_spec->end = str->end; /* see if there's a conversion specifier */ if (c == '!') { /* there must be another character present */ - if (str->start >= str->end) { + if (format_spec->ptr >= format_spec->end) { PyErr_SetString(PyExc_ValueError, - "end of string while looking for conversion " + "end of format while looking for conversion " "specifier"); return 0; } - *conversion = PyUnicode_READ_CHAR(str->str, str->start++); + *conversion = *(format_spec->ptr++); - if (str->start < str->end) { - c = PyUnicode_READ_CHAR(str->str, str->start++); - if (c == '}') - return 1; + /* if there is another character, it must be a colon */ + if (format_spec->ptr < format_spec->end) { + c = *(format_spec->ptr++); if (c != ':') { PyErr_SetString(PyExc_ValueError, - "expected ':' after conversion specifier"); + "expected ':' after format specifier"); return 0; } } } - format_spec->str = str->str; - format_spec->start = str->start; - count = 1; - while (str->start < str->end) { - switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) { - case '{': - *format_spec_needs_expanding = 1; - count++; - break; - case '}': - count--; - if (count == 0) { - format_spec->end = str->start - 1; - return 1; - } - break; - default: - break; - } - } - - PyErr_SetString(PyExc_ValueError, "unmatched '{' in format spec"); - return 0; - } - else if (c != '}') { - PyErr_SetString(PyExc_ValueError, "expected '}' before end of string"); - return 0; } + else + /* end of string, there's no format_spec or conversion */ + field_name->end = str->ptr; return 1; } @@ -660,10 +711,9 @@ typedef struct { } MarkupIterator; static int -MarkupIterator_init(MarkupIterator *self, PyObject *str, - Py_ssize_t start, Py_ssize_t end) +MarkupIterator_init(MarkupIterator *self, STRINGLIB_CHAR *ptr, Py_ssize_t len) { - SubString_init(&self->str, str, start, end); + SubString_init(&self->str, ptr, len); return 1; } @@ -672,29 +722,30 @@ MarkupIterator_init(MarkupIterator *self, PyObject *str, static int MarkupIterator_next(MarkupIterator *self, SubString *literal, int *field_present, SubString *field_name, - SubString *format_spec, Py_UCS4 *conversion, + SubString *format_spec, STRINGLIB_CHAR *conversion, int *format_spec_needs_expanding) { int at_end; - Py_UCS4 c = 0; - Py_ssize_t start; + STRINGLIB_CHAR c = 0; + STRINGLIB_CHAR *start; + int count; Py_ssize_t len; int markup_follows = 0; /* initialize all of the output variables */ - SubString_init(literal, NULL, 0, 0); - SubString_init(field_name, NULL, 0, 0); - SubString_init(format_spec, NULL, 0, 0); + SubString_init(literal, NULL, 0); + SubString_init(field_name, NULL, 0); + SubString_init(format_spec, NULL, 0); *conversion = '\0'; *format_spec_needs_expanding = 0; *field_present = 0; /* No more input, end of iterator. This is the normal exit path. */ - if (self->str.start >= self->str.end) + if (self->str.ptr >= self->str.end) return 1; - start = self->str.start; + start = self->str.ptr; /* First read any literal text. Read until the end of string, an escaped '{' or '}', or an unescaped '{'. In order to never @@ -703,8 +754,8 @@ MarkupIterator_next(MarkupIterator *self, SubString *literal, including the brace, but no format object. The next time through, we'll return the rest of the literal, skipping past the second consecutive brace. */ - while (self->str.start < self->str.end) { - switch (c = PyUnicode_READ_CHAR(self->str.str, self->str.start++)) { + while (self->str.ptr < self->str.end) { + switch (c = *(self->str.ptr++)) { case '{': case '}': markup_follows = 1; @@ -715,12 +766,10 @@ MarkupIterator_next(MarkupIterator *self, SubString *literal, break; } - at_end = self->str.start >= self->str.end; - len = self->str.start - start; + at_end = self->str.ptr >= self->str.end; + len = self->str.ptr - start; - if ((c == '}') && (at_end || - (c != PyUnicode_READ_CHAR(self->str.str, - self->str.start)))) { + if ((c == '}') && (at_end || (c != *self->str.ptr))) { PyErr_SetString(PyExc_ValueError, "Single '}' encountered " "in format string"); return 0; @@ -731,10 +780,10 @@ MarkupIterator_next(MarkupIterator *self, SubString *literal, return 0; } if (!at_end) { - if (c == PyUnicode_READ_CHAR(self->str.str, self->str.start)) { + if (c == *self->str.ptr) { /* escaped } or {, skip it in the input. there is no markup object following us, just this literal text */ - self->str.start++; + self->str.ptr++; markup_follows = 0; } else @@ -742,25 +791,56 @@ MarkupIterator_next(MarkupIterator *self, SubString *literal, } /* record the literal text */ - literal->str = self->str.str; - literal->start = start; + literal->ptr = start; literal->end = start + len; if (!markup_follows) return 2; - /* this is markup; parse the field */ + /* this is markup, find the end of the string by counting nested + braces. note that this prohibits escaped braces, so that + format_specs cannot have braces in them. */ *field_present = 1; - if (!parse_field(&self->str, field_name, format_spec, - format_spec_needs_expanding, conversion)) - return 0; - return 2; + count = 1; + + start = self->str.ptr; + + /* we know we can't have a zero length string, so don't worry + about that case */ + while (self->str.ptr < self->str.end) { + switch (c = *(self->str.ptr++)) { + case '{': + /* the format spec needs to be recursively expanded. + this is an optimization, and not strictly needed */ + *format_spec_needs_expanding = 1; + count++; + break; + case '}': + count--; + if (count <= 0) { + /* we're done. parse and get out */ + SubString s; + + SubString_init(&s, start, self->str.ptr - 1 - start); + if (parse_field(&s, field_name, format_spec, conversion) == 0) + return 0; + + /* success */ + return 2; + } + break; + } + } + + /* end of string while searching for matching '}' */ + PyErr_SetString(PyExc_ValueError, "unmatched '{' in format"); + return 0; } /* do the !r or !s conversion on obj */ static PyObject * -do_conversion(PyObject *obj, Py_UCS4 conversion) +do_conversion(PyObject *obj, STRINGLIB_CHAR conversion) { /* XXX in pre-3.0, do we need to convert this to unicode, since it might have returned a string? */ @@ -768,9 +848,7 @@ do_conversion(PyObject *obj, Py_UCS4 conversion) case 'r': return PyObject_Repr(obj); case 's': - return PyObject_Str(obj); - case 'a': - return PyObject_ASCII(obj); + return STRINGLIB_TOSTR(obj); default: if (conversion > 32 && conversion < 127) { /* It's the ASCII subrange; casting to char is safe @@ -802,8 +880,8 @@ do_conversion(PyObject *obj, Py_UCS4 conversion) static int output_markup(SubString *field_name, SubString *format_spec, - int format_spec_needs_expanding, Py_UCS4 conversion, - _PyUnicodeWriter *writer, PyObject *args, PyObject *kwargs, + int format_spec_needs_expanding, STRINGLIB_CHAR conversion, + OutputString *output, PyObject *args, PyObject *kwargs, int recursion_depth, AutoNumber *auto_number) { PyObject *tmp = NULL; @@ -819,7 +897,7 @@ output_markup(SubString *field_name, SubString *format_spec, if (conversion != '\0') { tmp = do_conversion(fieldobj, conversion); - if (tmp == NULL || PyUnicode_READY(tmp) == -1) + if (tmp == NULL) goto done; /* do the assignment, transferring ownership: fieldobj = tmp */ @@ -828,23 +906,24 @@ output_markup(SubString *field_name, SubString *format_spec, tmp = NULL; } - /* if needed, recursively compute the format_spec */ + /* if needed, recurively compute the format_spec */ if (format_spec_needs_expanding) { tmp = build_string(format_spec, args, kwargs, recursion_depth-1, auto_number); - if (tmp == NULL || PyUnicode_READY(tmp) == -1) + if (tmp == NULL) goto done; /* note that in the case we're expanding the format string, tmp must be kept around until after the call to render_field. */ - SubString_init(&expanded_format_spec, tmp, 0, PyUnicode_GET_LENGTH(tmp)); + SubString_init(&expanded_format_spec, + STRINGLIB_STR(tmp), STRINGLIB_LEN(tmp)); actual_format_spec = &expanded_format_spec; } else actual_format_spec = format_spec; - if (render_field(fieldobj, actual_format_spec, writer) == 0) + if (render_field(fieldobj, actual_format_spec, output) == 0) goto done; result = 1; @@ -864,7 +943,7 @@ done: */ static int do_markup(SubString *input, PyObject *args, PyObject *kwargs, - _PyUnicodeWriter *writer, int recursion_depth, AutoNumber *auto_number) + OutputString *output, int recursion_depth, AutoNumber *auto_number) { MarkupIterator iter; int format_spec_needs_expanding; @@ -873,29 +952,20 @@ do_markup(SubString *input, PyObject *args, PyObject *kwargs, SubString literal; SubString field_name; SubString format_spec; - Py_UCS4 conversion; + STRINGLIB_CHAR conversion; - MarkupIterator_init(&iter, input->str, input->start, input->end); + MarkupIterator_init(&iter, input->ptr, input->end - input->ptr); while ((result = MarkupIterator_next(&iter, &literal, &field_present, &field_name, &format_spec, &conversion, &format_spec_needs_expanding)) == 2) { - if (literal.end != literal.start) { - if (!field_present && iter.str.start == iter.str.end) - writer->overallocate = 0; - if (_PyUnicodeWriter_WriteSubstring(writer, literal.str, - literal.start, literal.end) < 0) - return 0; - } - - if (field_present) { - if (iter.str.start == iter.str.end) - writer->overallocate = 0; + if (!output_data(output, literal.ptr, literal.end - literal.ptr)) + return 0; + if (field_present) if (!output_markup(&field_name, &format_spec, - format_spec_needs_expanding, conversion, writer, + format_spec_needs_expanding, conversion, output, args, kwargs, recursion_depth, auto_number)) return 0; - } } return result; } @@ -909,26 +979,43 @@ static PyObject * build_string(SubString *input, PyObject *args, PyObject *kwargs, int recursion_depth, AutoNumber *auto_number) { - _PyUnicodeWriter writer; + OutputString output; + PyObject *result = NULL; + Py_ssize_t count; + + output.obj = NULL; /* needed so cleanup code always works */ /* check the recursion level */ if (recursion_depth <= 0) { PyErr_SetString(PyExc_ValueError, "Max string recursion exceeded"); - return NULL; + goto done; } - _PyUnicodeWriter_Init(&writer); - writer.overallocate = 1; - writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100; + /* initial size is the length of the format string, plus the size + increment. seems like a reasonable default */ + if (!output_initialize(&output, + input->end - input->ptr + + INITIAL_SIZE_INCREMENT)) + goto done; - if (!do_markup(input, args, kwargs, &writer, recursion_depth, + if (!do_markup(input, args, kwargs, &output, recursion_depth, auto_number)) { - _PyUnicodeWriter_Dealloc(&writer); - return NULL; + goto done; + } + + count = output.ptr - STRINGLIB_STR(output.obj); + if (STRINGLIB_RESIZE(&output.obj, count) < 0) { + goto done; } - return _PyUnicodeWriter_Finish(&writer); + /* transfer ownership to result */ + result = output.obj; + output.obj = NULL; + +done: + Py_XDECREF(output.obj); + return result; } /************************************************************************/ @@ -949,19 +1036,11 @@ do_string_format(PyObject *self, PyObject *args, PyObject *kwargs) AutoNumber auto_number; - if (PyUnicode_READY(self) == -1) - return NULL; - AutoNumber_Init(&auto_number); - SubString_init(&input, self, 0, PyUnicode_GET_LENGTH(self)); + SubString_init(&input, STRINGLIB_STR(self), STRINGLIB_LEN(self)); return build_string(&input, args, kwargs, recursion_depth, &auto_number); } -static PyObject * -do_string_format_map(PyObject *self, PyObject *obj) -{ - return do_string_format(self, NULL, obj); -} /************************************************************************/ @@ -975,7 +1054,9 @@ do_string_format_map(PyObject *self, PyObject *obj) typedef struct { PyObject_HEAD - PyObject *str; + + STRINGLIB_OBJECT *str; + MarkupIterator it_markup; } formatteriterobject; @@ -1000,7 +1081,7 @@ formatteriter_next(formatteriterobject *it) SubString literal; SubString field_name; SubString format_spec; - Py_UCS4 conversion; + STRINGLIB_CHAR conversion; int format_spec_needs_expanding; int field_present; int result = MarkupIterator_next(&it->it_markup, &literal, &field_present, @@ -1044,8 +1125,7 @@ formatteriter_next(formatteriterobject *it) Py_INCREF(conversion_str); } else - conversion_str = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, - &conversion, 1); + conversion_str = STRINGLIB_NEW(&conversion, 1); if (conversion_str == NULL) goto done; @@ -1071,10 +1151,10 @@ static PyTypeObject PyFormatterIter_Type = { 0, /* tp_itemsize */ /* methods */ (destructor)formatteriter_dealloc, /* tp_dealloc */ - 0, /* tp_vectorcall_offset */ + 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ - 0, /* tp_as_async */ + 0, /* tp_compare */ 0, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ @@ -1102,18 +1182,10 @@ static PyTypeObject PyFormatterIter_Type = { describing the parsed elements. It's a wrapper around stringlib/string_format.h's MarkupIterator */ static PyObject * -formatter_parser(PyObject *ignored, PyObject *self) +formatter_parser(STRINGLIB_OBJECT *self) { formatteriterobject *it; - if (!PyUnicode_Check(self)) { - PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name); - return NULL; - } - - if (PyUnicode_READY(self) == -1) - return NULL; - it = PyObject_New(formatteriterobject, &PyFormatterIter_Type); if (it == NULL) return NULL; @@ -1123,7 +1195,10 @@ formatter_parser(PyObject *ignored, PyObject *self) it->str = self; /* initialize the contained MarkupIterator */ - MarkupIterator_init(&it->it_markup, (PyObject*)self, 0, PyUnicode_GET_LENGTH(self)); + MarkupIterator_init(&it->it_markup, + STRINGLIB_STR(self), + STRINGLIB_LEN(self)); + return (PyObject *)it; } @@ -1139,7 +1214,9 @@ formatter_parser(PyObject *ignored, PyObject *self) typedef struct { PyObject_HEAD - PyObject *str; + + STRINGLIB_OBJECT *str; + FieldNameIterator it_field; } fieldnameiterobject; @@ -1207,10 +1284,10 @@ static PyTypeObject PyFieldNameIter_Type = { 0, /* tp_itemsize */ /* methods */ (destructor)fieldnameiter_dealloc, /* tp_dealloc */ - 0, /* tp_vectorcall_offset */ + 0, /* tp_print */ 0, /* tp_getattr */ 0, /* tp_setattr */ - 0, /* tp_as_async */ + 0, /* tp_compare */ 0, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ @@ -1240,7 +1317,7 @@ static PyTypeObject PyFieldNameIter_Type = { field_name_split. The iterator it returns is a FieldNameIterator */ static PyObject * -formatter_field_name_split(PyObject *ignored, PyObject *self) +formatter_field_name_split(STRINGLIB_OBJECT *self) { SubString first; Py_ssize_t first_idx; @@ -1249,14 +1326,6 @@ formatter_field_name_split(PyObject *ignored, PyObject *self) PyObject *first_obj = NULL; PyObject *result = NULL; - if (!PyUnicode_Check(self)) { - PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name); - return NULL; - } - - if (PyUnicode_READY(self) == -1) - return NULL; - it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type); if (it == NULL) return NULL; @@ -1268,7 +1337,8 @@ formatter_field_name_split(PyObject *ignored, PyObject *self) /* Pass in auto_number = NULL. We'll return an empty string for first_obj in that case. */ - if (!field_name_split((PyObject*)self, 0, PyUnicode_GET_LENGTH(self), + if (!field_name_split(STRINGLIB_STR(self), + STRINGLIB_LEN(self), &first, &first_idx, &it->it_field, NULL)) goto done; diff --git a/Objects/stringlib/stringdefs.h b/Objects/stringlib/stringdefs.h index ce27f3e..84e4616 100644 --- a/Objects/stringlib/stringdefs.h +++ b/Objects/stringlib/stringdefs.h @@ -6,10 +6,7 @@ compiled as unicode. */ #define STRINGLIB_IS_UNICODE 0 -#define FASTSEARCH fastsearch -#define STRINGLIB(F) stringlib_##F -#define STRINGLIB_OBJECT PyBytesObject -#define STRINGLIB_SIZEOF_CHAR 1 +#define STRINGLIB_OBJECT PyStringObject #define STRINGLIB_CHAR char #define STRINGLIB_TYPE_NAME "string" #define STRINGLIB_PARSE_CODE "S" @@ -18,11 +15,19 @@ #define STRINGLIB_ISLINEBREAK(x) ((x == '\n') || (x == '\r')) #define STRINGLIB_ISDECIMAL(x) ((x >= '0') && (x <= '9')) #define STRINGLIB_TODECIMAL(x) (STRINGLIB_ISDECIMAL(x) ? (x - '0') : -1) -#define STRINGLIB_STR PyBytes_AS_STRING -#define STRINGLIB_LEN PyBytes_GET_SIZE -#define STRINGLIB_NEW PyBytes_FromStringAndSize -#define STRINGLIB_CHECK PyBytes_Check -#define STRINGLIB_CHECK_EXACT PyBytes_CheckExact +#define STRINGLIB_TOUPPER Py_TOUPPER +#define STRINGLIB_TOLOWER Py_TOLOWER +#define STRINGLIB_FILL memset +#define STRINGLIB_STR PyString_AS_STRING +#define STRINGLIB_LEN PyString_GET_SIZE +#define STRINGLIB_NEW PyString_FromStringAndSize +#define STRINGLIB_RESIZE _PyString_Resize +#define STRINGLIB_CHECK PyString_Check +#define STRINGLIB_CHECK_EXACT PyString_CheckExact #define STRINGLIB_TOSTR PyObject_Str -#define STRINGLIB_TOASCII PyObject_Repr +#define STRINGLIB_GROUPING _PyString_InsertThousandsGrouping +#define STRINGLIB_GROUPING_LOCALE _PyString_InsertThousandsGroupingLocale + +#define STRINGLIB_WANT_CONTAINS_OBJ 1 + #endif /* !STRINGLIB_STRINGDEFS_H */ diff --git a/Objects/stringlib/transmogrify.h b/Objects/stringlib/transmogrify.h index e1165ea..be595a6 100644 --- a/Objects/stringlib/transmogrify.h +++ b/Objects/stringlib/transmogrify.h @@ -1,48 +1,27 @@ -#if STRINGLIB_IS_UNICODE -# error "transmogrify.h only compatible with byte-wise strings" -#endif +/* NOTE: this API is -ONLY- for use with single byte character strings. */ +/* Do not use it with Unicode. */ /* the more complicated methods. parts of these should be pulled out into the shared code in bytes_methods.c to cut down on duplicate code bloat. */ -/*[clinic input] -class B "PyObject *" "&PyType_Type" -[clinic start generated code]*/ -/*[clinic end generated code: output=da39a3ee5e6b4b0d input=2935558188d97c76]*/ - -#include "clinic/transmogrify.h.h" - -static inline PyObject * -return_self(PyObject *self) -{ -#if !STRINGLIB_MUTABLE - if (STRINGLIB_CHECK_EXACT(self)) { - Py_INCREF(self); - return self; - } -#endif - return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -} - -/*[clinic input] -B.expandtabs as stringlib_expandtabs - - tabsize: int = 8 - -Return a copy where all tab characters are expanded using spaces. +PyDoc_STRVAR(expandtabs__doc__, +"B.expandtabs([tabsize]) -> copy of B\n\ +\n\ +Return a copy of B where all tab characters are expanded using spaces.\n\ +If tabsize is not given, a tab size of 8 characters is assumed."); -If tabsize is not given, a tab size of 8 characters is assumed. -[clinic start generated code]*/ - -static PyObject * -stringlib_expandtabs_impl(PyObject *self, int tabsize) -/*[clinic end generated code: output=069cb7fae72e4c2b input=3c6d3b12aa3ccbea]*/ +static PyObject* +stringlib_expandtabs(PyObject *self, PyObject *args) { const char *e, *p; char *q; Py_ssize_t i, j; PyObject *u; - + int tabsize = 8; + + if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) + return NULL; + /* First pass: determine size of output string */ i = j = 0; e = STRINGLIB_STR(self) + STRINGLIB_LEN(self); @@ -67,18 +46,18 @@ stringlib_expandtabs_impl(PyObject *self, int tabsize) } } } - + if (i > PY_SSIZE_T_MAX - j) goto overflow; - + /* Second pass: create output string and fill it */ u = STRINGLIB_NEW(NULL, i + j); if (!u) return NULL; - + j = 0; q = STRINGLIB_STR(u); - + for (p = STRINGLIB_STR(self); p < e; p++) { if (*p == '\t') { if (tabsize > 0) { @@ -102,7 +81,7 @@ stringlib_expandtabs_impl(PyObject *self, int tabsize) return NULL; } -static inline PyObject * +Py_LOCAL_INLINE(PyObject *) pad(PyObject *self, Py_ssize_t left, Py_ssize_t right, char fill) { PyObject *u; @@ -112,93 +91,118 @@ pad(PyObject *self, Py_ssize_t left, Py_ssize_t right, char fill) if (right < 0) right = 0; - if (left == 0 && right == 0) { - return return_self(self); + if (left == 0 && right == 0 && STRINGLIB_CHECK_EXACT(self)) { +#if STRINGLIB_MUTABLE + /* We're defined as returning a copy; If the object is mutable + * that means we must make an identical copy. */ + return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); +#else + Py_INCREF(self); + return (PyObject *)self; +#endif /* STRINGLIB_MUTABLE */ } - u = STRINGLIB_NEW(NULL, left + STRINGLIB_LEN(self) + right); + u = STRINGLIB_NEW(NULL, + left + STRINGLIB_LEN(self) + right); if (u) { if (left) memset(STRINGLIB_STR(u), fill, left); - memcpy(STRINGLIB_STR(u) + left, - STRINGLIB_STR(self), - STRINGLIB_LEN(self)); + Py_MEMCPY(STRINGLIB_STR(u) + left, + STRINGLIB_STR(self), + STRINGLIB_LEN(self)); if (right) memset(STRINGLIB_STR(u) + left + STRINGLIB_LEN(self), - fill, right); + fill, right); } return u; } -/*[clinic input] -B.ljust as stringlib_ljust - - width: Py_ssize_t - fillchar: char = b' ' - / - -Return a left-justified string of length width. - -Padding is done using the specified fill character. -[clinic start generated code]*/ +PyDoc_STRVAR(ljust__doc__, +"B.ljust(width[, fillchar]) -> copy of B\n" +"\n" +"Return B left justified in a string of length width. Padding is\n" +"done using the specified fill character (default is a space)."); static PyObject * -stringlib_ljust_impl(PyObject *self, Py_ssize_t width, char fillchar) -/*[clinic end generated code: output=c79ca173c5ff8337 input=eff2d014bc7d80df]*/ +stringlib_ljust(PyObject *self, PyObject *args) { - if (STRINGLIB_LEN(self) >= width) { - return return_self(self); + Py_ssize_t width; + char fillchar = ' '; + + if (!PyArg_ParseTuple(args, "n|c:ljust", &width, &fillchar)) + return NULL; + + if (STRINGLIB_LEN(self) >= width && STRINGLIB_CHECK_EXACT(self)) { +#if STRINGLIB_MUTABLE + /* We're defined as returning a copy; If the object is mutable + * that means we must make an identical copy. */ + return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); +#else + Py_INCREF(self); + return (PyObject*) self; +#endif } return pad(self, 0, width - STRINGLIB_LEN(self), fillchar); } -/*[clinic input] -B.rjust as stringlib_rjust - - width: Py_ssize_t - fillchar: char = b' ' - / - -Return a right-justified string of length width. - -Padding is done using the specified fill character. -[clinic start generated code]*/ +PyDoc_STRVAR(rjust__doc__, +"B.rjust(width[, fillchar]) -> copy of B\n" +"\n" +"Return B right justified in a string of length width. Padding is\n" +"done using the specified fill character (default is a space)"); static PyObject * -stringlib_rjust_impl(PyObject *self, Py_ssize_t width, char fillchar) -/*[clinic end generated code: output=7df5d728a5439570 input=218b0bd31308955d]*/ +stringlib_rjust(PyObject *self, PyObject *args) { - if (STRINGLIB_LEN(self) >= width) { - return return_self(self); + Py_ssize_t width; + char fillchar = ' '; + + if (!PyArg_ParseTuple(args, "n|c:rjust", &width, &fillchar)) + return NULL; + + if (STRINGLIB_LEN(self) >= width && STRINGLIB_CHECK_EXACT(self)) { +#if STRINGLIB_MUTABLE + /* We're defined as returning a copy; If the object is mutable + * that means we must make an identical copy. */ + return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); +#else + Py_INCREF(self); + return (PyObject*) self; +#endif } return pad(self, width - STRINGLIB_LEN(self), 0, fillchar); } -/*[clinic input] -B.center as stringlib_center - - width: Py_ssize_t - fillchar: char = b' ' - / - -Return a centered string of length width. - -Padding is done using the specified fill character. -[clinic start generated code]*/ +PyDoc_STRVAR(center__doc__, +"B.center(width[, fillchar]) -> copy of B\n" +"\n" +"Return B centered in a string of length width. Padding is\n" +"done using the specified fill character (default is a space)."); static PyObject * -stringlib_center_impl(PyObject *self, Py_ssize_t width, char fillchar) -/*[clinic end generated code: output=d8da2e055288b4c2 input=3776fd278765d89b]*/ +stringlib_center(PyObject *self, PyObject *args) { Py_ssize_t marg, left; + Py_ssize_t width; + char fillchar = ' '; - if (STRINGLIB_LEN(self) >= width) { - return return_self(self); + if (!PyArg_ParseTuple(args, "n|c:center", &width, &fillchar)) + return NULL; + + if (STRINGLIB_LEN(self) >= width && STRINGLIB_CHECK_EXACT(self)) { +#if STRINGLIB_MUTABLE + /* We're defined as returning a copy; If the object is mutable + * that means we must make an identical copy. */ + return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); +#else + Py_INCREF(self); + return (PyObject*) self; +#endif } marg = width - STRINGLIB_LEN(self); @@ -207,27 +211,39 @@ stringlib_center_impl(PyObject *self, Py_ssize_t width, char fillchar) return pad(self, left, marg - left, fillchar); } -/*[clinic input] -B.zfill as stringlib_zfill - - width: Py_ssize_t - / - -Pad a numeric string with zeros on the left, to fill a field of the given width. - -The original string is never truncated. -[clinic start generated code]*/ +PyDoc_STRVAR(zfill__doc__, +"B.zfill(width) -> copy of B\n" +"\n" +"Pad a numeric string B with zeros on the left, to fill a field\n" +"of the specified width. B is never truncated."); static PyObject * -stringlib_zfill_impl(PyObject *self, Py_ssize_t width) -/*[clinic end generated code: output=0b3c684a7f1b2319 input=2da6d7b8e9bcb19a]*/ +stringlib_zfill(PyObject *self, PyObject *args) { Py_ssize_t fill; PyObject *s; char *p; + Py_ssize_t width; + + if (!PyArg_ParseTuple(args, "n:zfill", &width)) + return NULL; if (STRINGLIB_LEN(self) >= width) { - return return_self(self); + if (STRINGLIB_CHECK_EXACT(self)) { +#if STRINGLIB_MUTABLE + /* We're defined as returning a copy; If the object is mutable + * that means we must make an identical copy. */ + return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); +#else + Py_INCREF(self); + return (PyObject*) self; +#endif + } + else + return STRINGLIB_NEW( + STRINGLIB_STR(self), + STRINGLIB_LEN(self) + ); } fill = width - STRINGLIB_LEN(self); @@ -244,497 +260,5 @@ stringlib_zfill_impl(PyObject *self, Py_ssize_t width) p[fill] = '0'; } - return s; -} - - -/* find and count characters and substrings */ - -#define findchar(target, target_len, c) \ - ((char *)memchr((const void *)(target), c, target_len)) - - -static Py_ssize_t -countchar(const char *target, Py_ssize_t target_len, char c, - Py_ssize_t maxcount) -{ - Py_ssize_t count = 0; - const char *start = target; - const char *end = target + target_len; - - while ((start = findchar(start, end - start, c)) != NULL) { - count++; - if (count >= maxcount) - break; - start += 1; - } - return count; -} - - -/* Algorithms for different cases of string replacement */ - -/* len(self)>=1, from="", len(to)>=1, maxcount>=1 */ -static PyObject * -stringlib_replace_interleave(PyObject *self, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) -{ - const char *self_s; - char *result_s; - Py_ssize_t self_len, result_len; - Py_ssize_t count, i; - PyObject *result; - - self_len = STRINGLIB_LEN(self); - - /* 1 at the end plus 1 after every character; - count = min(maxcount, self_len + 1) */ - if (maxcount <= self_len) { - count = maxcount; - } - else { - /* Can't overflow: self_len + 1 <= maxcount <= PY_SSIZE_T_MAX. */ - count = self_len + 1; - } - - /* Check for overflow */ - /* result_len = count * to_len + self_len; */ - assert(count > 0); - if (to_len > (PY_SSIZE_T_MAX - self_len) / count) { - PyErr_SetString(PyExc_OverflowError, - "replace bytes is too long"); - return NULL; - } - result_len = count * to_len + self_len; - result = STRINGLIB_NEW(NULL, result_len); - if (result == NULL) { - return NULL; - } - - self_s = STRINGLIB_STR(self); - result_s = STRINGLIB_STR(result); - - if (to_len > 1) { - /* Lay the first one down (guaranteed this will occur) */ - memcpy(result_s, to_s, to_len); - result_s += to_len; - count -= 1; - - for (i = 0; i < count; i++) { - *result_s++ = *self_s++; - memcpy(result_s, to_s, to_len); - result_s += to_len; - } - } - else { - result_s[0] = to_s[0]; - result_s += to_len; - count -= 1; - for (i = 0; i < count; i++) { - *result_s++ = *self_s++; - result_s[0] = to_s[0]; - result_s += to_len; - } - } - - /* Copy the rest of the original string */ - memcpy(result_s, self_s, self_len - i); - - return result; -} - -/* Special case for deleting a single character */ -/* len(self)>=1, len(from)==1, to="", maxcount>=1 */ -static PyObject * -stringlib_replace_delete_single_character(PyObject *self, - char from_c, Py_ssize_t maxcount) -{ - const char *self_s, *start, *next, *end; - char *result_s; - Py_ssize_t self_len, result_len; - Py_ssize_t count; - PyObject *result; - - self_len = STRINGLIB_LEN(self); - self_s = STRINGLIB_STR(self); - - count = countchar(self_s, self_len, from_c, maxcount); - if (count == 0) { - return return_self(self); - } - - result_len = self_len - count; /* from_len == 1 */ - assert(result_len>=0); - - result = STRINGLIB_NEW(NULL, result_len); - if (result == NULL) { - return NULL; - } - result_s = STRINGLIB_STR(result); - - start = self_s; - end = self_s + self_len; - while (count-- > 0) { - next = findchar(start, end - start, from_c); - if (next == NULL) - break; - memcpy(result_s, start, next - start); - result_s += (next - start); - start = next + 1; - } - memcpy(result_s, start, end - start); - - return result; -} - -/* len(self)>=1, len(from)>=2, to="", maxcount>=1 */ - -static PyObject * -stringlib_replace_delete_substring(PyObject *self, - const char *from_s, Py_ssize_t from_len, - Py_ssize_t maxcount) -{ - const char *self_s, *start, *next, *end; - char *result_s; - Py_ssize_t self_len, result_len; - Py_ssize_t count, offset; - PyObject *result; - - self_len = STRINGLIB_LEN(self); - self_s = STRINGLIB_STR(self); - - count = stringlib_count(self_s, self_len, - from_s, from_len, - maxcount); - - if (count == 0) { - /* no matches */ - return return_self(self); - } - - result_len = self_len - (count * from_len); - assert (result_len>=0); - - result = STRINGLIB_NEW(NULL, result_len); - if (result == NULL) { - return NULL; - } - result_s = STRINGLIB_STR(result); - - start = self_s; - end = self_s + self_len; - while (count-- > 0) { - offset = stringlib_find(start, end - start, - from_s, from_len, - 0); - if (offset == -1) - break; - next = start + offset; - - memcpy(result_s, start, next - start); - - result_s += (next - start); - start = next + from_len; - } - memcpy(result_s, start, end - start); - return result; -} - -/* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */ -static PyObject * -stringlib_replace_single_character_in_place(PyObject *self, - char from_c, char to_c, - Py_ssize_t maxcount) -{ - const char *self_s, *end; - char *result_s, *start, *next; - Py_ssize_t self_len; - PyObject *result; - - /* The result string will be the same size */ - self_s = STRINGLIB_STR(self); - self_len = STRINGLIB_LEN(self); - - next = findchar(self_s, self_len, from_c); - - if (next == NULL) { - /* No matches; return the original bytes */ - return return_self(self); - } - - /* Need to make a new bytes */ - result = STRINGLIB_NEW(NULL, self_len); - if (result == NULL) { - return NULL; - } - result_s = STRINGLIB_STR(result); - memcpy(result_s, self_s, self_len); - - /* change everything in-place, starting with this one */ - start = result_s + (next - self_s); - *start = to_c; - start++; - end = result_s + self_len; - - while (--maxcount > 0) { - next = findchar(start, end - start, from_c); - if (next == NULL) - break; - *next = to_c; - start = next + 1; - } - - return result; -} - -/* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */ -static PyObject * -stringlib_replace_substring_in_place(PyObject *self, - const char *from_s, Py_ssize_t from_len, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) -{ - const char *self_s, *end; - char *result_s, *start; - Py_ssize_t self_len, offset; - PyObject *result; - - /* The result bytes will be the same size */ - - self_s = STRINGLIB_STR(self); - self_len = STRINGLIB_LEN(self); - - offset = stringlib_find(self_s, self_len, - from_s, from_len, - 0); - if (offset == -1) { - /* No matches; return the original bytes */ - return return_self(self); - } - - /* Need to make a new bytes */ - result = STRINGLIB_NEW(NULL, self_len); - if (result == NULL) { - return NULL; - } - result_s = STRINGLIB_STR(result); - memcpy(result_s, self_s, self_len); - - /* change everything in-place, starting with this one */ - start = result_s + offset; - memcpy(start, to_s, from_len); - start += from_len; - end = result_s + self_len; - - while ( --maxcount > 0) { - offset = stringlib_find(start, end - start, - from_s, from_len, - 0); - if (offset == -1) - break; - memcpy(start + offset, to_s, from_len); - start += offset + from_len; - } - - return result; -} - -/* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */ -static PyObject * -stringlib_replace_single_character(PyObject *self, - char from_c, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) -{ - const char *self_s, *start, *next, *end; - char *result_s; - Py_ssize_t self_len, result_len; - Py_ssize_t count; - PyObject *result; - - self_s = STRINGLIB_STR(self); - self_len = STRINGLIB_LEN(self); - - count = countchar(self_s, self_len, from_c, maxcount); - if (count == 0) { - /* no matches, return unchanged */ - return return_self(self); - } - - /* use the difference between current and new, hence the "-1" */ - /* result_len = self_len + count * (to_len-1) */ - assert(count > 0); - if (to_len - 1 > (PY_SSIZE_T_MAX - self_len) / count) { - PyErr_SetString(PyExc_OverflowError, "replace bytes is too long"); - return NULL; - } - result_len = self_len + count * (to_len - 1); - - result = STRINGLIB_NEW(NULL, result_len); - if (result == NULL) { - return NULL; - } - result_s = STRINGLIB_STR(result); - - start = self_s; - end = self_s + self_len; - while (count-- > 0) { - next = findchar(start, end - start, from_c); - if (next == NULL) - break; - - if (next == start) { - /* replace with the 'to' */ - memcpy(result_s, to_s, to_len); - result_s += to_len; - start += 1; - } else { - /* copy the unchanged old then the 'to' */ - memcpy(result_s, start, next - start); - result_s += (next - start); - memcpy(result_s, to_s, to_len); - result_s += to_len; - start = next + 1; - } - } - /* Copy the remainder of the remaining bytes */ - memcpy(result_s, start, end - start); - - return result; -} - -/* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */ -static PyObject * -stringlib_replace_substring(PyObject *self, - const char *from_s, Py_ssize_t from_len, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) -{ - const char *self_s, *start, *next, *end; - char *result_s; - Py_ssize_t self_len, result_len; - Py_ssize_t count, offset; - PyObject *result; - - self_s = STRINGLIB_STR(self); - self_len = STRINGLIB_LEN(self); - - count = stringlib_count(self_s, self_len, - from_s, from_len, - maxcount); - - if (count == 0) { - /* no matches, return unchanged */ - return return_self(self); - } - - /* Check for overflow */ - /* result_len = self_len + count * (to_len-from_len) */ - assert(count > 0); - if (to_len - from_len > (PY_SSIZE_T_MAX - self_len) / count) { - PyErr_SetString(PyExc_OverflowError, "replace bytes is too long"); - return NULL; - } - result_len = self_len + count * (to_len - from_len); - - result = STRINGLIB_NEW(NULL, result_len); - if (result == NULL) { - return NULL; - } - result_s = STRINGLIB_STR(result); - - start = self_s; - end = self_s + self_len; - while (count-- > 0) { - offset = stringlib_find(start, end - start, - from_s, from_len, - 0); - if (offset == -1) - break; - next = start + offset; - if (next == start) { - /* replace with the 'to' */ - memcpy(result_s, to_s, to_len); - result_s += to_len; - start += from_len; - } else { - /* copy the unchanged old then the 'to' */ - memcpy(result_s, start, next - start); - result_s += (next - start); - memcpy(result_s, to_s, to_len); - result_s += to_len; - start = next + from_len; - } - } - /* Copy the remainder of the remaining bytes */ - memcpy(result_s, start, end - start); - - return result; -} - - -static PyObject * -stringlib_replace(PyObject *self, - const char *from_s, Py_ssize_t from_len, - const char *to_s, Py_ssize_t to_len, - Py_ssize_t maxcount) -{ - if (STRINGLIB_LEN(self) < from_len) { - /* nothing to do; return the original bytes */ - return return_self(self); - } - if (maxcount < 0) { - maxcount = PY_SSIZE_T_MAX; - } else if (maxcount == 0) { - /* nothing to do; return the original bytes */ - return return_self(self); - } - - /* Handle zero-length special cases */ - if (from_len == 0) { - if (to_len == 0) { - /* nothing to do; return the original bytes */ - return return_self(self); - } - /* insert the 'to' bytes everywhere. */ - /* >>> b"Python".replace(b"", b".") */ - /* b'.P.y.t.h.o.n.' */ - return stringlib_replace_interleave(self, to_s, to_len, maxcount); - } - - if (to_len == 0) { - /* delete all occurrences of 'from' bytes */ - if (from_len == 1) { - return stringlib_replace_delete_single_character( - self, from_s[0], maxcount); - } else { - return stringlib_replace_delete_substring( - self, from_s, from_len, maxcount); - } - } - - /* Handle special case where both bytes have the same length */ - - if (from_len == to_len) { - if (from_len == 1) { - return stringlib_replace_single_character_in_place( - self, from_s[0], to_s[0], maxcount); - } else { - return stringlib_replace_substring_in_place( - self, from_s, from_len, to_s, to_len, maxcount); - } - } - - /* Otherwise use the more generic algorithms */ - if (from_len == 1) { - return stringlib_replace_single_character( - self, from_s[0], to_s, to_len, maxcount); - } else { - /* len('from')>=2, len('to')>=1 */ - return stringlib_replace_substring( - self, from_s, from_len, to_s, to_len, maxcount); - } + return (PyObject*) s; } - -#undef findchar diff --git a/Objects/stringlib/ucs1lib.h b/Objects/stringlib/ucs1lib.h deleted file mode 100644 index ce1eb57..0000000 --- a/Objects/stringlib/ucs1lib.h +++ /dev/null @@ -1,30 +0,0 @@ -/* this is sort of a hack. there's at least one place (formatting - floats) where some stringlib code takes a different path if it's - compiled as unicode. */ -#define STRINGLIB_IS_UNICODE 1 - -#define FASTSEARCH ucs1lib_fastsearch -#define STRINGLIB(F) ucs1lib_##F -#define STRINGLIB_OBJECT PyUnicodeObject -#define STRINGLIB_SIZEOF_CHAR 1 -#define STRINGLIB_MAX_CHAR 0xFFu -#define STRINGLIB_CHAR Py_UCS1 -#define STRINGLIB_TYPE_NAME "unicode" -#define STRINGLIB_PARSE_CODE "U" -#define STRINGLIB_EMPTY unicode_empty -#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE -#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK -#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL -#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL -#define STRINGLIB_STR PyUnicode_1BYTE_DATA -#define STRINGLIB_LEN PyUnicode_GET_LENGTH -#define STRINGLIB_NEW _PyUnicode_FromUCS1 -#define STRINGLIB_CHECK PyUnicode_Check -#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact - -#define STRINGLIB_TOSTR PyObject_Str -#define STRINGLIB_TOASCII PyObject_ASCII - -#define _Py_InsertThousandsGrouping _PyUnicode_ucs1_InsertThousandsGrouping - - diff --git a/Objects/stringlib/ucs2lib.h b/Objects/stringlib/ucs2lib.h deleted file mode 100644 index f900cb6..0000000 --- a/Objects/stringlib/ucs2lib.h +++ /dev/null @@ -1,29 +0,0 @@ -/* this is sort of a hack. there's at least one place (formatting - floats) where some stringlib code takes a different path if it's - compiled as unicode. */ -#define STRINGLIB_IS_UNICODE 1 - -#define FASTSEARCH ucs2lib_fastsearch -#define STRINGLIB(F) ucs2lib_##F -#define STRINGLIB_OBJECT PyUnicodeObject -#define STRINGLIB_SIZEOF_CHAR 2 -#define STRINGLIB_MAX_CHAR 0xFFFFu -#define STRINGLIB_CHAR Py_UCS2 -#define STRINGLIB_TYPE_NAME "unicode" -#define STRINGLIB_PARSE_CODE "U" -#define STRINGLIB_EMPTY unicode_empty -#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE -#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK -#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL -#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL -#define STRINGLIB_STR PyUnicode_2BYTE_DATA -#define STRINGLIB_LEN PyUnicode_GET_LENGTH -#define STRINGLIB_NEW _PyUnicode_FromUCS2 -#define STRINGLIB_CHECK PyUnicode_Check -#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact - -#define STRINGLIB_TOSTR PyObject_Str -#define STRINGLIB_TOASCII PyObject_ASCII - -#define _Py_InsertThousandsGrouping _PyUnicode_ucs2_InsertThousandsGrouping - diff --git a/Objects/stringlib/ucs4lib.h b/Objects/stringlib/ucs4lib.h deleted file mode 100644 index 86a480f..0000000 --- a/Objects/stringlib/ucs4lib.h +++ /dev/null @@ -1,29 +0,0 @@ -/* this is sort of a hack. there's at least one place (formatting - floats) where some stringlib code takes a different path if it's - compiled as unicode. */ -#define STRINGLIB_IS_UNICODE 1 - -#define FASTSEARCH ucs4lib_fastsearch -#define STRINGLIB(F) ucs4lib_##F -#define STRINGLIB_OBJECT PyUnicodeObject -#define STRINGLIB_SIZEOF_CHAR 4 -#define STRINGLIB_MAX_CHAR 0x10FFFFu -#define STRINGLIB_CHAR Py_UCS4 -#define STRINGLIB_TYPE_NAME "unicode" -#define STRINGLIB_PARSE_CODE "U" -#define STRINGLIB_EMPTY unicode_empty -#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE -#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK -#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL -#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL -#define STRINGLIB_STR PyUnicode_4BYTE_DATA -#define STRINGLIB_LEN PyUnicode_GET_LENGTH -#define STRINGLIB_NEW _PyUnicode_FromUCS4 -#define STRINGLIB_CHECK PyUnicode_Check -#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact - -#define STRINGLIB_TOSTR PyObject_Str -#define STRINGLIB_TOASCII PyObject_ASCII - -#define _Py_InsertThousandsGrouping _PyUnicode_ucs4_InsertThousandsGrouping - diff --git a/Objects/stringlib/undef.h b/Objects/stringlib/undef.h deleted file mode 100644 index f9d3f1d..0000000 --- a/Objects/stringlib/undef.h +++ /dev/null @@ -1,11 +0,0 @@ -#undef FASTSEARCH -#undef STRINGLIB -#undef STRINGLIB_SIZEOF_CHAR -#undef STRINGLIB_MAX_CHAR -#undef STRINGLIB_CHAR -#undef STRINGLIB_STR -#undef STRINGLIB_LEN -#undef STRINGLIB_NEW -#undef _Py_InsertThousandsGrouping -#undef STRINGLIB_IS_UNICODE - diff --git a/Objects/stringlib/unicodedefs.h b/Objects/stringlib/unicodedefs.h index 3db5629..dd814f6 100644 --- a/Objects/stringlib/unicodedefs.h +++ b/Objects/stringlib/unicodedefs.h @@ -6,10 +6,7 @@ compiled as unicode. */ #define STRINGLIB_IS_UNICODE 1 -#define FASTSEARCH fastsearch -#define STRINGLIB(F) stringlib_##F #define STRINGLIB_OBJECT PyUnicodeObject -#define STRINGLIB_SIZEOF_CHAR Py_UNICODE_SIZE #define STRINGLIB_CHAR Py_UNICODE #define STRINGLIB_TYPE_NAME "unicode" #define STRINGLIB_PARSE_CODE "U" @@ -18,14 +15,22 @@ #define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL #define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL +#define STRINGLIB_TOUPPER Py_UNICODE_TOUPPER +#define STRINGLIB_TOLOWER Py_UNICODE_TOLOWER +#define STRINGLIB_FILL Py_UNICODE_FILL #define STRINGLIB_STR PyUnicode_AS_UNICODE #define STRINGLIB_LEN PyUnicode_GET_SIZE #define STRINGLIB_NEW PyUnicode_FromUnicode +#define STRINGLIB_RESIZE PyUnicode_Resize #define STRINGLIB_CHECK PyUnicode_Check #define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact +#define STRINGLIB_GROUPING _PyUnicode_InsertThousandsGrouping +#if PY_VERSION_HEX < 0x03000000 +#define STRINGLIB_TOSTR PyObject_Unicode +#else #define STRINGLIB_TOSTR PyObject_Str -#define STRINGLIB_TOASCII PyObject_ASCII +#endif #define STRINGLIB_WANT_CONTAINS_OBJ 1 |