diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2011-09-28 05:41:54 (GMT) |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2011-09-28 05:41:54 (GMT) |
commit | d63a3b8beb4a0841cb59fb3515347ccaab34b733 (patch) | |
tree | 3b4e3cc63151c5a5a910c3550a190aefaea96ad4 /Modules | |
parent | 48d49497c50e79d14e9df9527d766ca3a0a38be5 (diff) | |
download | cpython-d63a3b8beb4a0841cb59fb3515347ccaab34b733.zip cpython-d63a3b8beb4a0841cb59fb3515347ccaab34b733.tar.gz cpython-d63a3b8beb4a0841cb59fb3515347ccaab34b733.tar.bz2 |
Implement PEP 393.
Diffstat (limited to 'Modules')
-rw-r--r-- | Modules/_codecsmodule.c | 8 | ||||
-rw-r--r-- | Modules/_csv.c | 2 | ||||
-rw-r--r-- | Modules/_ctypes/_ctypes.c | 6 | ||||
-rw-r--r-- | Modules/_ctypes/callproc.c | 8 | ||||
-rw-r--r-- | Modules/_ctypes/cfield.c | 66 | ||||
-rw-r--r-- | Modules/_cursesmodule.c | 7 | ||||
-rw-r--r-- | Modules/_datetimemodule.c | 11 | ||||
-rw-r--r-- | Modules/_dbmmodule.c | 12 | ||||
-rw-r--r-- | Modules/_elementtree.c | 29 | ||||
-rw-r--r-- | Modules/_io/_iomodule.h | 2 | ||||
-rw-r--r-- | Modules/_io/stringio.c | 69 | ||||
-rw-r--r-- | Modules/_io/textio.c | 352 | ||||
-rw-r--r-- | Modules/_json.c | 252 | ||||
-rw-r--r-- | Modules/_pickle.c | 4 | ||||
-rw-r--r-- | Modules/_sqlite/connection.c | 19 | ||||
-rw-r--r-- | Modules/_sre.c | 378 | ||||
-rw-r--r-- | Modules/_testcapimodule.c | 2 | ||||
-rw-r--r-- | Modules/_tkinter.c | 70 | ||||
-rw-r--r-- | Modules/arraymodule.c | 8 | ||||
-rw-r--r-- | Modules/md5module.c | 10 | ||||
-rw-r--r-- | Modules/operator.c | 27 | ||||
-rw-r--r-- | Modules/pyexpat.c | 11 | ||||
-rw-r--r-- | Modules/sha1module.c | 10 | ||||
-rw-r--r-- | Modules/sha256module.c | 10 | ||||
-rw-r--r-- | Modules/sha512module.c | 10 | ||||
-rw-r--r-- | Modules/sre.h | 4 | ||||
-rw-r--r-- | Modules/syslogmodule.c | 14 | ||||
-rw-r--r-- | Modules/unicodedata.c | 28 | ||||
-rw-r--r-- | Modules/zipimport.c | 141 |
29 files changed, 790 insertions, 780 deletions
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index 4bc0482..c9843c2 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -700,12 +700,10 @@ utf_8_encode(PyObject *self, return NULL; str = PyUnicode_FromObject(str); - if (str == NULL) + if (str == NULL || PyUnicode_READY(str) == -1) return NULL; - v = codec_tuple(PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str), - errors), - PyUnicode_GET_SIZE(str)); + v = codec_tuple(PyUnicode_AsEncodedString(str, "utf-8", errors), + PyUnicode_GET_LENGTH(str)); Py_DECREF(str); return v; } diff --git a/Modules/_csv.c b/Modules/_csv.c index a749cec..1a63da4 100644 --- a/Modules/_csv.c +++ b/Modules/_csv.c @@ -128,7 +128,7 @@ get_nullchar_as_None(Py_UNICODE c) return Py_None; } else - return PyUnicode_FromUnicode((Py_UNICODE *)&c, 1); + return PyUnicode_FromOrdinal(c); } static PyObject * diff --git a/Modules/_ctypes/_ctypes.c b/Modules/_ctypes/_ctypes.c index 3df26da..e2905bc 100644 --- a/Modules/_ctypes/_ctypes.c +++ b/Modules/_ctypes/_ctypes.c @@ -1843,11 +1843,9 @@ PyCSimpleType_new(PyTypeObject *type, PyObject *args, PyObject *kwds) return NULL; } if (PyUnicode_Check(proto)) { - PyObject *v = _PyUnicode_AsDefaultEncodedString(proto); - if (!v) + proto_str = PyUnicode_AsUTF8AndSize(proto, &proto_len); + if (!proto_str) goto error; - proto_str = PyBytes_AS_STRING(v); - proto_len = PyBytes_GET_SIZE(v); } else { PyErr_SetString(PyExc_TypeError, "class must define a '_type_' string attribute"); diff --git a/Modules/_ctypes/callproc.c b/Modules/_ctypes/callproc.c index 1f3c1c0..5a5ce31 100644 --- a/Modules/_ctypes/callproc.c +++ b/Modules/_ctypes/callproc.c @@ -658,13 +658,6 @@ static int ConvParam(PyObject *obj, Py_ssize_t index, struct argument *pa) #ifdef CTYPES_UNICODE if (PyUnicode_Check(obj)) { -#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T - pa->ffi_type = &ffi_type_pointer; - pa->value.p = PyUnicode_AS_UNICODE(obj); - Py_INCREF(obj); - pa->keep = obj; - return 0; -#else pa->ffi_type = &ffi_type_pointer; pa->value.p = PyUnicode_AsWideCharString(obj, NULL); if (pa->value.p == NULL) @@ -675,7 +668,6 @@ static int ConvParam(PyObject *obj, Py_ssize_t index, struct argument *pa) return -1; } return 0; -#endif } #endif diff --git a/Modules/_ctypes/cfield.c b/Modules/_ctypes/cfield.c index 0aa9f0b..d324ca9 100644 --- a/Modules/_ctypes/cfield.c +++ b/Modules/_ctypes/cfield.c @@ -9,7 +9,6 @@ #define CTYPES_CFIELD_CAPSULE_NAME_PYMEM "_ctypes/cfield.c pymem" -#if Py_UNICODE_SIZE != SIZEOF_WCHAR_T static void pymem_destructor(PyObject *ptr) { void *p = PyCapsule_GetPointer(ptr, CTYPES_CFIELD_CAPSULE_NAME_PYMEM); @@ -17,7 +16,6 @@ static void pymem_destructor(PyObject *ptr) PyMem_Free(p); } } -#endif /******************************************************************/ @@ -1238,32 +1236,24 @@ u_get(void *ptr, Py_ssize_t size) static PyObject * U_get(void *ptr, Py_ssize_t size) { - PyObject *result; Py_ssize_t len; - Py_UNICODE *p; + wchar_t *p; size /= sizeof(wchar_t); /* we count character units here, not bytes */ - result = PyUnicode_FromWideChar((wchar_t *)ptr, size); - if (!result) - return NULL; /* We need 'result' to be able to count the characters with wcslen, since ptr may not be NUL terminated. If the length is smaller (if it was actually NUL terminated, we construct a new one and throw away the result. */ /* chop off at the first NUL character, if any. */ - p = PyUnicode_AS_UNICODE(result); - for (len = 0; len < size; ++len) + p = (wchar_t*)ptr; + for (len = 0; len < size; ++len) { if (!p[len]) break; - - if (len < size) { - PyObject *ob = PyUnicode_FromWideChar((wchar_t *)ptr, len); - Py_DECREF(result); - return ob; } - return result; + + return PyUnicode_FromWideChar((wchar_t *)ptr, len); } static PyObject * @@ -1401,6 +1391,9 @@ z_get(void *ptr, Py_ssize_t size) static PyObject * Z_set(void *ptr, PyObject *value, Py_ssize_t size) { + PyObject *keep; + wchar_t *buffer; + if (value == Py_None) { *(wchar_t **)ptr = NULL; Py_INCREF(value); @@ -1420,37 +1413,20 @@ Z_set(void *ptr, PyObject *value, Py_ssize_t size) "unicode string or integer address expected instead of %s instance", value->ob_type->tp_name); return NULL; - } else - Py_INCREF(value); -#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T - /* We can copy directly. Hm, are unicode objects always NUL - terminated in Python, internally? - */ - *(wchar_t **)ptr = (wchar_t *) PyUnicode_AS_UNICODE(value); - return value; -#else - { - /* We must create a wchar_t* buffer from the unicode object, - and keep it alive */ - PyObject *keep; - wchar_t *buffer; - - buffer = PyUnicode_AsWideCharString(value, NULL); - if (!buffer) { - Py_DECREF(value); - return NULL; - } - keep = PyCapsule_New(buffer, CTYPES_CFIELD_CAPSULE_NAME_PYMEM, pymem_destructor); - if (!keep) { - Py_DECREF(value); - PyMem_Free(buffer); - return NULL; - } - *(wchar_t **)ptr = (wchar_t *)buffer; - Py_DECREF(value); - return keep; } -#endif + + /* We must create a wchar_t* buffer from the unicode object, + and keep it alive */ + buffer = PyUnicode_AsWideCharString(value, NULL); + if (!buffer) + return NULL; + keep = PyCapsule_New(buffer, CTYPES_CFIELD_CAPSULE_NAME_PYMEM, pymem_destructor); + if (!keep) { + PyMem_Free(buffer); + return NULL; + } + *(wchar_t **)ptr = buffer; + return keep; } static PyObject * diff --git a/Modules/_cursesmodule.c b/Modules/_cursesmodule.c index ef0a66c..191d53f 100644 --- a/Modules/_cursesmodule.c +++ b/Modules/_cursesmodule.c @@ -203,8 +203,11 @@ PyCurses_ConvertToChtype(PyObject *obj, chtype *ch) } else if(PyBytes_Check(obj) && (PyBytes_Size(obj) == 1)) { *ch = (chtype) *PyBytes_AsString(obj); - } else if (PyUnicode_Check(obj) && PyUnicode_GetSize(obj) == 1) { - *ch = (chtype) *PyUnicode_AS_UNICODE(obj); + } else if (PyUnicode_Check(obj) && PyUnicode_GET_LENGTH(obj) == 1) { + Py_UCS4 ucs = PyUnicode_READ(PyUnicode_KIND(obj), + PyUnicode_DATA(obj), + 0); + *ch = (chtype)ucs; } else { return 0; } diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c index 26e0ed0..a7156a4 100644 --- a/Modules/_datetimemodule.c +++ b/Modules/_datetimemodule.c @@ -985,9 +985,8 @@ append_keyword_tzinfo(PyObject *repr, PyObject *tzinfo) if (tzinfo == Py_None) return repr; /* Get rid of the trailing ')'. */ - assert(PyUnicode_AS_UNICODE(repr)[PyUnicode_GET_SIZE(repr)-1] == ')'); - temp = PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(repr), - PyUnicode_GET_SIZE(repr) - 1); + assert(PyUnicode_READ_CHAR(repr, PyUnicode_GET_LENGTH(repr)-1) == ')'); + temp = PyUnicode_Substring(repr, 0, PyUnicode_GET_LENGTH(repr) - 1); Py_DECREF(repr); if (temp == NULL) return NULL; @@ -4214,9 +4213,9 @@ static PyObject * datetime_strptime(PyObject *cls, PyObject *args) { static PyObject *module = NULL; - const Py_UNICODE *string, *format; + PyObject *string, *format; - if (!PyArg_ParseTuple(args, "uu:strptime", &string, &format)) + if (!PyArg_ParseTuple(args, "UU:strptime", &string, &format)) return NULL; if (module == NULL) { @@ -4224,7 +4223,7 @@ datetime_strptime(PyObject *cls, PyObject *args) if (module == NULL) return NULL; } - return PyObject_CallMethod(module, "_strptime_datetime", "Ouu", + return PyObject_CallMethod(module, "_strptime_datetime", "OOO", cls, string, format); } diff --git a/Modules/_dbmmodule.c b/Modules/_dbmmodule.c index 69a7112..327b873 100644 --- a/Modules/_dbmmodule.c +++ b/Modules/_dbmmodule.c @@ -212,6 +212,7 @@ dbm_contains(PyObject *self, PyObject *arg) { dbmobject *dp = (dbmobject *)self; datum key, val; + Py_ssize_t size; if ((dp)->di_dbm == NULL) { PyErr_SetString(DbmError, @@ -219,8 +220,9 @@ dbm_contains(PyObject *self, PyObject *arg) return -1; } if (PyUnicode_Check(arg)) { - arg = _PyUnicode_AsDefaultEncodedString(arg); - if (arg == NULL) + key.dptr = PyUnicode_AsUTF8AndSize(arg, &size); + key.dsize = size; + if (key.dptr == NULL) return -1; } if (!PyBytes_Check(arg)) { @@ -229,8 +231,10 @@ dbm_contains(PyObject *self, PyObject *arg) arg->ob_type->tp_name); return -1; } - key.dptr = PyBytes_AS_STRING(arg); - key.dsize = PyBytes_GET_SIZE(arg); + else { + key.dptr = PyBytes_AS_STRING(arg); + key.dsize = PyBytes_GET_SIZE(arg); + } val = dbm_fetch(dp->di_dbm, key); return val.dptr != NULL; } diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c index 6373c48..0c64dd5 100644 --- a/Modules/_elementtree.c +++ b/Modules/_elementtree.c @@ -723,13 +723,16 @@ checkpath(PyObject* tag) (ch == '/' || ch == '*' || ch == '[' || ch == '@' || ch == '.') if (PyUnicode_Check(tag)) { - Py_UNICODE *p = PyUnicode_AS_UNICODE(tag); - for (i = 0; i < PyUnicode_GET_SIZE(tag); i++) { - if (p[i] == '{') + const Py_ssize_t len = PyUnicode_GET_LENGTH(tag); + void *data = PyUnicode_DATA(tag); + unsigned int kind = PyUnicode_KIND(tag); + for (i = 0; i < len; i++) { + Py_UCS4 ch = PyUnicode_READ(kind, data, i); + if (ch == '{') check = 0; - else if (p[i] == '}') + else if (ch == '}') check = 1; - else if (check && PATHCHAR(p[i])) + else if (check && PATHCHAR(ch)) return 1; } return 0; @@ -2401,9 +2404,10 @@ expat_unknown_encoding_handler(XMLParserObject *self, const XML_Char *name, XML_Encoding *info) { PyObject* u; - Py_UNICODE* p; unsigned char s[256]; int i; + void *data; + unsigned int kind; memset(info, 0, sizeof(XML_Encoding)); @@ -2413,17 +2417,20 @@ expat_unknown_encoding_handler(XMLParserObject *self, const XML_Char *name, u = PyUnicode_Decode((char*) s, 256, name, "replace"); if (!u) return XML_STATUS_ERROR; + if (PyUnicode_READY(u)) + return XML_STATUS_ERROR; - if (PyUnicode_GET_SIZE(u) != 256) { + if (PyUnicode_GET_LENGTH(u) != 256) { Py_DECREF(u); return XML_STATUS_ERROR; } - p = PyUnicode_AS_UNICODE(u); - + kind = PyUnicode_KIND(u); + data = PyUnicode_DATA(u); for (i = 0; i < 256; i++) { - if (p[i] != Py_UNICODE_REPLACEMENT_CHARACTER) - info->map[i] = p[i]; + Py_UCS4 ch = PyUnicode_READ(kind, data, i); + if (ch != Py_UNICODE_REPLACEMENT_CHARACTER) + info->map[i] = ch; else info->map[i] = -1; } diff --git a/Modules/_io/_iomodule.h b/Modules/_io/_iomodule.h index 9174bdd..4e97dd1 100644 --- a/Modules/_io/_iomodule.h +++ b/Modules/_io/_iomodule.h @@ -55,7 +55,7 @@ extern PyObject *_PyIncrementalNewlineDecoder_decode( Otherwise, the function will scan further and return garbage. */ extern Py_ssize_t _PyIO_find_line_ending( int translated, int universal, PyObject *readnl, - Py_UNICODE *start, Py_UNICODE *end, Py_ssize_t *consumed); + int kind, char *start, char *end, Py_ssize_t *consumed); #define DEFAULT_BUFFER_SIZE (8 * 1024) /* bytes */ diff --git a/Modules/_io/stringio.c b/Modules/_io/stringio.c index c9d14b1..c40163f 100644 --- a/Modules/_io/stringio.c +++ b/Modules/_io/stringio.c @@ -9,7 +9,7 @@ typedef struct { PyObject_HEAD - Py_UNICODE *buf; + Py_UCS4 *buf; Py_ssize_t pos; Py_ssize_t string_size; size_t buf_size; @@ -21,7 +21,7 @@ typedef struct { PyObject *decoder; PyObject *readnl; PyObject *writenl; - + PyObject *dict; PyObject *weakreflist; } stringio; @@ -56,7 +56,7 @@ resize_buffer(stringio *self, size_t size) /* Here, unsigned types are used to avoid dealing with signed integer overflow, which is undefined in C. */ size_t alloc = self->buf_size; - Py_UNICODE *new_buf = NULL; + Py_UCS4 *new_buf = NULL; assert(self->buf != NULL); @@ -84,10 +84,9 @@ resize_buffer(stringio *self, size_t size) alloc = size + 1; } - if (alloc > ((size_t)-1) / sizeof(Py_UNICODE)) + if (alloc > PY_SIZE_MAX / sizeof(Py_UCS4)) goto overflow; - new_buf = (Py_UNICODE *)PyMem_Realloc(self->buf, - alloc * sizeof(Py_UNICODE)); + new_buf = (Py_UCS4 *)PyMem_Realloc(self->buf, alloc * sizeof(Py_UCS4)); if (new_buf == NULL) { PyErr_NoMemory(); return -1; @@ -108,9 +107,9 @@ resize_buffer(stringio *self, size_t size) static Py_ssize_t write_str(stringio *self, PyObject *obj) { - Py_UNICODE *str; Py_ssize_t len; PyObject *decoded = NULL; + assert(self->buf != NULL); assert(self->pos >= 0); @@ -132,8 +131,7 @@ write_str(stringio *self, PyObject *obj) return -1; assert(PyUnicode_Check(decoded)); - str = PyUnicode_AS_UNICODE(decoded); - len = PyUnicode_GET_SIZE(decoded); + len = PyUnicode_GET_LENGTH(decoded); assert(len >= 0); @@ -161,18 +159,21 @@ write_str(stringio *self, PyObject *obj) */ memset(self->buf + self->string_size, '\0', - (self->pos - self->string_size) * sizeof(Py_UNICODE)); + (self->pos - self->string_size) * sizeof(Py_UCS4)); } /* Copy the data to the internal buffer, overwriting some of the existing data if self->pos < self->string_size. */ - memcpy(self->buf + self->pos, str, len * sizeof(Py_UNICODE)); - self->pos += len; + if (!PyUnicode_AsUCS4(decoded, + self->buf + self->pos, + self->buf_size - self->pos, + 0)) + goto fail; /* Set the new length of the internal string if it has changed. */ - if (self->string_size < self->pos) { + self->pos += len; + if (self->string_size < self->pos) self->string_size = self->pos; - } Py_DECREF(decoded); return 0; @@ -190,7 +191,8 @@ stringio_getvalue(stringio *self) { CHECK_INITIALIZED(self); CHECK_CLOSED(self); - return PyUnicode_FromUnicode(self->buf, self->string_size); + return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, self->buf, + self->string_size); } PyDoc_STRVAR(stringio_tell_doc, @@ -214,7 +216,7 @@ static PyObject * stringio_read(stringio *self, PyObject *args) { Py_ssize_t size, n; - Py_UNICODE *output; + Py_UCS4 *output; PyObject *arg = Py_None; CHECK_INITIALIZED(self); @@ -247,19 +249,19 @@ stringio_read(stringio *self, PyObject *args) output = self->buf + self->pos; self->pos += size; - return PyUnicode_FromUnicode(output, size); + return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, size); } /* Internal helper, used by stringio_readline and stringio_iternext */ static PyObject * _stringio_readline(stringio *self, Py_ssize_t limit) { - Py_UNICODE *start, *end, old_char; + Py_UCS4 *start, *end, old_char; Py_ssize_t len, consumed; /* In case of overseek, return the empty string */ if (self->pos >= self->string_size) - return PyUnicode_FromString(""); + return PyUnicode_New(0, 0); start = self->buf + self->pos; if (limit < 0 || limit > self->string_size - self->pos) @@ -270,14 +272,14 @@ _stringio_readline(stringio *self, Py_ssize_t limit) *end = '\0'; len = _PyIO_find_line_ending( self->readtranslate, self->readuniversal, self->readnl, - start, end, &consumed); + PyUnicode_4BYTE_KIND, (char*)start, (char*)end, &consumed); *end = old_char; /* If we haven't found any line ending, we just return everything (`consumed` is ignored). */ if (len < 0) len = limit; self->pos += len; - return PyUnicode_FromUnicode(start, len); + return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, start, len); } PyDoc_STRVAR(stringio_readline_doc, @@ -462,8 +464,10 @@ stringio_write(stringio *self, PyObject *obj) Py_TYPE(obj)->tp_name); return NULL; } + if (PyUnicode_READY(obj)) + return NULL; CHECK_CLOSED(self); - size = PyUnicode_GET_SIZE(obj); + size = PyUnicode_GET_LENGTH(obj); if (size > 0 && write_str(self, obj) < 0) return NULL; @@ -535,7 +539,7 @@ stringio_new(PyTypeObject *type, PyObject *args, PyObject *kwds) /* tp_alloc initializes all the fields to zero. So we don't have to initialize them here. */ - self->buf = (Py_UNICODE *)PyMem_Malloc(0); + self->buf = (Py_UCS4 *)PyMem_Malloc(0); if (self->buf == NULL) { Py_DECREF(self); return PyErr_NoMemory(); @@ -747,11 +751,22 @@ stringio_setstate(stringio *self, PyObject *state) once by __init__. So we do not take any chance and replace object's buffer completely. */ { - Py_UNICODE *buf = PyUnicode_AS_UNICODE(PyTuple_GET_ITEM(state, 0)); - Py_ssize_t bufsize = PyUnicode_GET_SIZE(PyTuple_GET_ITEM(state, 0)); - if (resize_buffer(self, bufsize) < 0) + PyObject *item; + Py_UCS4 *buf; + Py_ssize_t bufsize; + + item = PyTuple_GET_ITEM(state, 0); + buf = PyUnicode_AsUCS4Copy(item); + if (buf == NULL) return NULL; - memcpy(self->buf, buf, bufsize * sizeof(Py_UNICODE)); + bufsize = PyUnicode_GET_LENGTH(item); + + if (resize_buffer(self, bufsize) < 0) { + PyMem_Free(buf); + return NULL; + } + memcpy(self->buf, buf, bufsize * sizeof(Py_UCS4)); + PyMem_Free(buf); self->string_size = bufsize; } diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c index 13d4bd9..9c06ec8 100644 --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -274,18 +274,28 @@ _PyIncrementalNewlineDecoder_decode(PyObject *_self, goto error; } - output_len = PyUnicode_GET_SIZE(output); + if (PyUnicode_READY(output) == -1) + goto error; + + output_len = PyUnicode_GET_LENGTH(output); if (self->pendingcr && (final || output_len > 0)) { - Py_UNICODE *out; - PyObject *modified = PyUnicode_FromUnicode(NULL, output_len + 1); + /* Prefix output with CR */ + int kind; + PyObject *modified; + char *out; + + modified = PyUnicode_New(output_len + 1, + PyUnicode_MAX_CHAR_VALUE(output)); if (modified == NULL) goto error; - out = PyUnicode_AS_UNICODE(modified); - out[0] = '\r'; - memcpy(out + 1, PyUnicode_AS_UNICODE(output), - output_len * sizeof(Py_UNICODE)); + kind = PyUnicode_KIND(modified); + out = PyUnicode_DATA(modified); + PyUnicode_WRITE(kind, PyUnicode_DATA(modified), 0, '\r'); + memcpy(out + PyUnicode_KIND_SIZE(kind, 1), + PyUnicode_DATA(output), + PyUnicode_KIND_SIZE(kind, output_len)); Py_DECREF(output); - output = modified; + output = modified; /* output remains ready */ self->pendingcr = 0; output_len++; } @@ -295,21 +305,13 @@ _PyIncrementalNewlineDecoder_decode(PyObject *_self, */ if (!final) { if (output_len > 0 - && PyUnicode_AS_UNICODE(output)[output_len - 1] == '\r') { - - if (Py_REFCNT(output) == 1) { - if (PyUnicode_Resize(&output, output_len - 1) < 0) - goto error; - } - else { - PyObject *modified = PyUnicode_FromUnicode( - PyUnicode_AS_UNICODE(output), - output_len - 1); - if (modified == NULL) - goto error; - Py_DECREF(output); - output = modified; - } + && PyUnicode_READ_CHAR(output, output_len - 1) == '\r') + { + PyObject *modified = PyUnicode_Substring(output, 0, output_len -1); + if (modified == NULL) + goto error; + Py_DECREF(output); + output = modified; self->pendingcr = 1; } } @@ -317,13 +319,15 @@ _PyIncrementalNewlineDecoder_decode(PyObject *_self, /* Record which newlines are read and do newline translation if desired, all in one pass. */ { - Py_UNICODE *in_str; + void *in_str; Py_ssize_t len; int seennl = self->seennl; int only_lf = 0; + int kind; - in_str = PyUnicode_AS_UNICODE(output); - len = PyUnicode_GET_SIZE(output); + in_str = PyUnicode_DATA(output); + len = PyUnicode_GET_LENGTH(output); + kind = PyUnicode_KIND(output); if (len == 0) return output; @@ -332,7 +336,7 @@ _PyIncrementalNewlineDecoder_decode(PyObject *_self, for the \r *byte* with the libc's optimized memchr. */ if (seennl == SEEN_LF || seennl == 0) { - only_lf = (memchr(in_str, '\r', len * sizeof(Py_UNICODE)) == NULL); + only_lf = (memchr(in_str, '\r', PyUnicode_KIND_SIZE(kind, len)) == NULL); } if (only_lf) { @@ -340,21 +344,19 @@ _PyIncrementalNewlineDecoder_decode(PyObject *_self, (there's nothing else to be done, even when in translation mode) */ if (seennl == 0 && - memchr(in_str, '\n', len * sizeof(Py_UNICODE)) != NULL) { - Py_UNICODE *s, *end; - s = in_str; - end = in_str + len; + memchr(in_str, '\n', PyUnicode_KIND_SIZE(kind, len)) != NULL) { + Py_ssize_t i = 0; for (;;) { Py_UNICODE c; /* Fast loop for non-control characters */ - while (*s > '\n') - s++; - c = *s++; + while (PyUnicode_READ(kind, in_str, i) > '\n') + i++; + c = PyUnicode_READ(kind, in_str, i++); if (c == '\n') { seennl |= SEEN_LF; break; } - if (s > end) + if (i >= len) break; } } @@ -362,29 +364,27 @@ _PyIncrementalNewlineDecoder_decode(PyObject *_self, need translating */ } else if (!self->translate) { - Py_UNICODE *s, *end; + Py_ssize_t i = 0; /* We have already seen all newline types, no need to scan again */ if (seennl == SEEN_ALL) goto endscan; - s = in_str; - end = in_str + len; for (;;) { - Py_UNICODE c; + Py_UCS4 c; /* Fast loop for non-control characters */ - while (*s > '\r') - s++; - c = *s++; + while (PyUnicode_READ(kind, in_str, i) > '\r') + i++; + c = PyUnicode_READ(kind, in_str, i++); if (c == '\n') seennl |= SEEN_LF; else if (c == '\r') { - if (*s == '\n') { + if (PyUnicode_READ(kind, in_str, i) == '\n') { seennl |= SEEN_CRLF; - s++; + i++; } else seennl |= SEEN_CR; } - if (s > end) + if (i >= len) break; if (seennl == SEEN_ALL) break; @@ -393,61 +393,50 @@ _PyIncrementalNewlineDecoder_decode(PyObject *_self, ; } else { - PyObject *translated = NULL; - Py_UNICODE *out_str; - Py_UNICODE *in, *out, *end; - if (Py_REFCNT(output) != 1) { - /* We could try to optimize this so that we only do a copy - when there is something to translate. On the other hand, - most decoders should only output non-shared strings, i.e. - translation is done in place. */ - translated = PyUnicode_FromUnicode(NULL, len); - if (translated == NULL) - goto error; - assert(Py_REFCNT(translated) == 1); - memcpy(PyUnicode_AS_UNICODE(translated), - PyUnicode_AS_UNICODE(output), - len * sizeof(Py_UNICODE)); - } - else { - translated = output; + void *translated; + int kind = PyUnicode_KIND(output); + void *in_str = PyUnicode_DATA(output); + Py_ssize_t in, out; + /* XXX: Previous in-place translation here is disabled as + resizing is not possible anymore */ + /* We could try to optimize this so that we only do a copy + when there is something to translate. On the other hand, + we already know there is a \r byte, so chances are high + that something needs to be done. */ + translated = PyMem_Malloc(PyUnicode_KIND_SIZE(kind, len)); + if (translated == NULL) { + PyErr_NoMemory(); + goto error; } - out_str = PyUnicode_AS_UNICODE(translated); - in = in_str; - out = out_str; - end = in_str + len; + in = out = 0; for (;;) { - Py_UNICODE c; + Py_UCS4 c; /* Fast loop for non-control characters */ - while ((c = *in++) > '\r') - *out++ = c; + while ((c = PyUnicode_READ(kind, in_str, in++)) > '\r') + PyUnicode_WRITE(kind, translated, out++, c); if (c == '\n') { - *out++ = c; + PyUnicode_WRITE(kind, translated, out++, c); seennl |= SEEN_LF; continue; } if (c == '\r') { - if (*in == '\n') { + if (PyUnicode_READ(kind, in_str, in) == '\n') { in++; seennl |= SEEN_CRLF; } else seennl |= SEEN_CR; - *out++ = '\n'; + PyUnicode_WRITE(kind, translated, out++, '\n'); continue; } - if (in > end) + if (in > len) break; - *out++ = c; - } - if (translated != output) { - Py_DECREF(output); - output = translated; - } - if (out - out_str != len) { - if (PyUnicode_Resize(&output, out - out_str) < 0) - goto error; + PyUnicode_WRITE(kind, translated, out++, c); } + Py_DECREF(output); + output = PyUnicode_FromKindAndData(kind, translated, out); + if (!output) + goto error; } self->seennl |= seennl; } @@ -705,9 +694,7 @@ typedef struct static PyObject * ascii_encode(textio *self, PyObject *text) { - return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(text), - PyUnicode_GET_SIZE(text), - PyBytes_AS_STRING(self->errors)); + return _PyUnicode_AsASCIIString(text, PyBytes_AS_STRING(self->errors)); } static PyObject * @@ -777,17 +764,13 @@ utf32_encode(textio *self, PyObject *text) static PyObject * utf8_encode(textio *self, PyObject *text) { - return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(text), - PyUnicode_GET_SIZE(text), - PyBytes_AS_STRING(self->errors)); + return _PyUnicode_AsUTF8String(text, PyBytes_AS_STRING(self->errors)); } static PyObject * latin1_encode(textio *self, PyObject *text) { - return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(text), - PyUnicode_GET_SIZE(text), - PyBytes_AS_STRING(self->errors)); + return _PyUnicode_AsLatin1String(text, PyBytes_AS_STRING(self->errors)); } /* Map normalized encoding names onto the specialized encoding funcs */ @@ -1213,18 +1196,6 @@ textiowrapper_detach(textio *self) return buffer; } -Py_LOCAL_INLINE(const Py_UNICODE *) -findchar(const Py_UNICODE *s, Py_ssize_t size, Py_UNICODE ch) -{ - /* like wcschr, but doesn't stop at NULL characters */ - while (size-- > 0) { - if (*s == ch) - return s; - s++; - } - return NULL; -} - /* Flush the internal write buffer. This doesn't explicitly flush the underlying buffered object, though. */ static int @@ -1269,6 +1240,9 @@ textiowrapper_write(textio *self, PyObject *args) return NULL; } + if (PyUnicode_READY(text) == -1) + return NULL; + CHECK_CLOSED(self); if (self->encoder == NULL) @@ -1276,11 +1250,10 @@ textiowrapper_write(textio *self, PyObject *args) Py_INCREF(text); - textlen = PyUnicode_GetSize(text); + textlen = PyUnicode_GET_LENGTH(text); if ((self->writetranslate && self->writenl != NULL) || self->line_buffering) - if (findchar(PyUnicode_AS_UNICODE(text), - PyUnicode_GET_SIZE(text), '\n')) + if (PyUnicode_FindChar(text, '\n', 0, PyUnicode_GET_LENGTH(text), 1) != -1) haslf = 1; if (haslf && self->writetranslate && self->writenl != NULL) { @@ -1296,8 +1269,7 @@ textiowrapper_write(textio *self, PyObject *args) needflush = 1; else if (self->line_buffering && (haslf || - findchar(PyUnicode_AS_UNICODE(text), - PyUnicode_GET_SIZE(text), '\r'))) + PyUnicode_FindChar(text, '\r', 0, PyUnicode_GET_LENGTH(text), 1) != -1)) needflush = 1; /* XXX What if we were just reading? */ @@ -1369,7 +1341,8 @@ textiowrapper_get_decoded_chars(textio *self, Py_ssize_t n) if (self->decoded_chars == NULL) return PyUnicode_FromStringAndSize(NULL, 0); - avail = (PyUnicode_GET_SIZE(self->decoded_chars) + /* decoded_chars is guaranteed to be "ready". */ + avail = (PyUnicode_GET_LENGTH(self->decoded_chars) - self->decoded_chars_used); assert(avail >= 0); @@ -1378,9 +1351,9 @@ textiowrapper_get_decoded_chars(textio *self, Py_ssize_t n) n = avail; if (self->decoded_chars_used > 0 || n < avail) { - chars = PyUnicode_FromUnicode( - PyUnicode_AS_UNICODE(self->decoded_chars) - + self->decoded_chars_used, n); + chars = PyUnicode_Substring(self->decoded_chars, + self->decoded_chars_used, + self->decoded_chars_used + n); if (chars == NULL) return NULL; } @@ -1464,8 +1437,10 @@ textiowrapper_read_chunk(textio *self) /* TODO sanity check: isinstance(decoded_chars, unicode) */ if (decoded_chars == NULL) goto fail; + if (PyUnicode_READY(decoded_chars) == -1) + goto fail; textiowrapper_set_decoded_chars(self, decoded_chars); - nchars = PyUnicode_GET_SIZE(decoded_chars); + nchars = PyUnicode_GET_LENGTH(decoded_chars); if (nchars > 0) self->b2cratio = (double) nbytes / nchars; else @@ -1553,7 +1528,9 @@ textiowrapper_read(textio *self, PyObject *args) result = textiowrapper_get_decoded_chars(self, n); if (result == NULL) goto fail; - remaining -= PyUnicode_GET_SIZE(result); + if (PyUnicode_READY(result) == -1) + goto fail; + remaining -= PyUnicode_GET_LENGTH(result); /* Keep reading chunks until we have n characters to return */ while (remaining > 0) { @@ -1573,7 +1550,7 @@ textiowrapper_read(textio *self, PyObject *args) result = textiowrapper_get_decoded_chars(self, remaining); if (result == NULL) goto fail; - remaining -= PyUnicode_GET_SIZE(result); + remaining -= PyUnicode_GET_LENGTH(result); } if (chunks != NULL) { if (result != NULL && PyList_Append(chunks, result) < 0) @@ -1596,33 +1573,34 @@ textiowrapper_read(textio *self, PyObject *args) /* NOTE: `end` must point to the real end of the Py_UNICODE storage, that is to the NUL character. Otherwise the function will produce incorrect results. */ -static Py_UNICODE * -find_control_char(Py_UNICODE *start, Py_UNICODE *end, Py_UNICODE ch) +static char * +find_control_char(int kind, char *s, char *end, Py_UCS4 ch) { - Py_UNICODE *s = start; + int size = PyUnicode_KIND_SIZE(kind, 1); for (;;) { - while (*s > ch) - s++; - if (*s == ch) + while (PyUnicode_READ(kind, s, 0) > ch) + s += size; + if (PyUnicode_READ(kind, s, 0) == ch) return s; if (s == end) return NULL; - s++; + s += size; } } Py_ssize_t _PyIO_find_line_ending( int translated, int universal, PyObject *readnl, - Py_UNICODE *start, Py_UNICODE *end, Py_ssize_t *consumed) + int kind, char *start, char *end, Py_ssize_t *consumed) { - Py_ssize_t len = end - start; + int size = PyUnicode_KIND_SIZE(kind, 1); + Py_ssize_t len = ((char*)end - (char*)start)/size; if (translated) { /* Newlines are already translated, only search for \n */ - Py_UNICODE *pos = find_control_char(start, end, '\n'); + char *pos = find_control_char(kind, start, end, '\n'); if (pos != NULL) - return pos - start + 1; + return (pos - start)/size + 1; else { *consumed = len; return -1; @@ -1632,63 +1610,66 @@ _PyIO_find_line_ending( /* Universal newline search. Find any of \r, \r\n, \n * The decoder ensures that \r\n are not split in two pieces */ - Py_UNICODE *s = start; + char *s = start; for (;;) { - Py_UNICODE ch; + Py_UCS4 ch; /* Fast path for non-control chars. The loop always ends since the Py_UNICODE storage is NUL-terminated. */ - while (*s > '\r') - s++; + while (PyUnicode_READ(kind, s, 0) > '\r') + s += size; if (s >= end) { *consumed = len; return -1; } - ch = *s++; + ch = PyUnicode_READ(kind, s, 0); + s += size; if (ch == '\n') - return s - start; + return (s - start)/size; if (ch == '\r') { - if (*s == '\n') - return s - start + 1; + if (PyUnicode_READ(kind, s, 0) == '\n') + return (s - start)/size + 1; else - return s - start; + return (s - start)/size; } } } else { /* Non-universal mode. */ - Py_ssize_t readnl_len = PyUnicode_GET_SIZE(readnl); - Py_UNICODE *nl = PyUnicode_AS_UNICODE(readnl); + Py_ssize_t readnl_len = PyUnicode_GET_LENGTH(readnl); + char *nl = PyUnicode_DATA(readnl); + /* Assume that readnl is an ASCII character. */ + assert(PyUnicode_KIND(readnl) == PyUnicode_1BYTE_KIND); if (readnl_len == 1) { - Py_UNICODE *pos = find_control_char(start, end, nl[0]); + char *pos = find_control_char(kind, start, end, nl[0]); if (pos != NULL) - return pos - start + 1; + return (pos - start)/size + 1; *consumed = len; return -1; } else { - Py_UNICODE *s = start; - Py_UNICODE *e = end - readnl_len + 1; - Py_UNICODE *pos; + char *s = start; + char *e = end - (readnl_len - 1)*size; + char *pos; if (e < s) e = s; while (s < e) { Py_ssize_t i; - Py_UNICODE *pos = find_control_char(s, end, nl[0]); + char *pos = find_control_char(kind, s, end, nl[0]); if (pos == NULL || pos >= e) break; for (i = 1; i < readnl_len; i++) { - if (pos[i] != nl[i]) + if (PyUnicode_READ(kind, pos, i) != nl[i]) break; } if (i == readnl_len) - return pos - start + readnl_len; - s = pos + 1; + return (pos - start)/size + readnl_len; + s = pos + size; } - pos = find_control_char(e, end, nl[0]); + pos = find_control_char(kind, e, end, nl[0]); if (pos == NULL) *consumed = len; else - *consumed = pos - start; + *consumed = (pos - start)/size; return -1; } } @@ -1709,14 +1690,15 @@ _textiowrapper_readline(textio *self, Py_ssize_t limit) chunked = 0; while (1) { - Py_UNICODE *ptr; + char *ptr; Py_ssize_t line_len; + int kind; Py_ssize_t consumed = 0; /* First, get some data if necessary */ res = 1; while (!self->decoded_chars || - !PyUnicode_GET_SIZE(self->decoded_chars)) { + !PyUnicode_GET_LENGTH(self->decoded_chars)) { res = textiowrapper_read_chunk(self); if (res < 0) goto error; @@ -1741,18 +1723,24 @@ _textiowrapper_readline(textio *self, Py_ssize_t limit) assert(self->decoded_chars_used == 0); line = PyUnicode_Concat(remaining, self->decoded_chars); start = 0; - offset_to_buffer = PyUnicode_GET_SIZE(remaining); + offset_to_buffer = PyUnicode_GET_LENGTH(remaining); Py_CLEAR(remaining); if (line == NULL) goto error; + if (PyUnicode_READY(line) == -1) + goto error; } - ptr = PyUnicode_AS_UNICODE(line); - line_len = PyUnicode_GET_SIZE(line); + ptr = PyUnicode_DATA(line); + line_len = PyUnicode_GET_LENGTH(line); + kind = PyUnicode_KIND(line); endpos = _PyIO_find_line_ending( self->readtranslate, self->readuniversal, self->readnl, - ptr + start, ptr + line_len, &consumed); + kind, + ptr + PyUnicode_KIND_SIZE(kind, start), + ptr + PyUnicode_KIND_SIZE(kind, line_len), + &consumed); if (endpos >= 0) { endpos += start; if (limit >= 0 && (endpos - start) + chunked >= limit) @@ -1776,21 +1764,20 @@ _textiowrapper_readline(textio *self, Py_ssize_t limit) if (chunks == NULL) goto error; } - s = PyUnicode_FromUnicode(ptr + start, endpos - start); + s = PyUnicode_Substring(line, start, endpos); if (s == NULL) goto error; if (PyList_Append(chunks, s) < 0) { Py_DECREF(s); goto error; } - chunked += PyUnicode_GET_SIZE(s); + chunked += PyUnicode_GET_LENGTH(s); Py_DECREF(s); } /* There may be some remaining bytes we'll have to prepend to the next chunk of data */ if (endpos < line_len) { - remaining = PyUnicode_FromUnicode( - ptr + endpos, line_len - endpos); + remaining = PyUnicode_Substring(line, endpos, line_len); if (remaining == NULL) goto error; } @@ -1802,19 +1789,12 @@ _textiowrapper_readline(textio *self, Py_ssize_t limit) if (line != NULL) { /* Our line ends in the current buffer */ self->decoded_chars_used = endpos - offset_to_buffer; - if (start > 0 || endpos < PyUnicode_GET_SIZE(line)) { - if (start == 0 && Py_REFCNT(line) == 1) { - if (PyUnicode_Resize(&line, endpos) < 0) - goto error; - } - else { - PyObject *s = PyUnicode_FromUnicode( - PyUnicode_AS_UNICODE(line) + start, endpos - start); - Py_CLEAR(line); - if (s == NULL) - goto error; - line = s; - } + if (start > 0 || endpos < PyUnicode_GET_LENGTH(line)) { + PyObject *s = PyUnicode_Substring(line, start, endpos); + Py_CLEAR(line); + if (s == NULL) + goto error; + line = s; } } if (remaining != NULL) { @@ -1828,16 +1808,20 @@ _textiowrapper_readline(textio *self, Py_ssize_t limit) Py_CLEAR(remaining); } if (chunks != NULL) { - if (line != NULL && PyList_Append(chunks, line) < 0) - goto error; - Py_CLEAR(line); + if (line != NULL) { + if (PyList_Append(chunks, line) < 0) + goto error; + Py_DECREF(line); + } line = PyUnicode_Join(_PyIO_empty_str, chunks); if (line == NULL) goto error; - Py_DECREF(chunks); + Py_CLEAR(chunks); + } + if (line == NULL) { + Py_INCREF(_PyIO_empty_str); + line = _PyIO_empty_str; } - if (line == NULL) - line = PyUnicode_FromStringAndSize(NULL, 0); return line; @@ -2128,6 +2112,10 @@ textiowrapper_seek(textio *self, PyObject *args) if (decoded == NULL) goto fail; + if (PyUnicode_READY(decoded) == -1) { + Py_DECREF(decoded); + goto fail; + } textiowrapper_set_decoded_chars(self, decoded); @@ -2250,7 +2238,7 @@ textiowrapper_tell(textio *self, PyObject *args) if (_decoded == NULL) \ goto fail; \ assert (PyUnicode_Check(_decoded)); \ - res = PyUnicode_GET_SIZE(_decoded); \ + res = PyUnicode_GET_LENGTH(_decoded); \ Py_DECREF(_decoded); \ } while (0) @@ -2333,7 +2321,7 @@ textiowrapper_tell(textio *self, PyObject *args) if (decoded == NULL) goto fail; assert (PyUnicode_Check(decoded)); - chars_decoded += PyUnicode_GET_SIZE(decoded); + chars_decoded += PyUnicode_GET_LENGTH(decoded); Py_DECREF(decoded); cookie.need_eof = 1; @@ -2559,10 +2547,10 @@ textiowrapper_iternext(textio *self) } } - if (line == NULL) + if (line == NULL || PyUnicode_READY(line) == -1) return NULL; - if (PyUnicode_GET_SIZE(line) == 0) { + if (PyUnicode_GET_LENGTH(line) == 0) { /* Reached EOF or would have blocked */ Py_DECREF(line); Py_CLEAR(self->snapshot); diff --git a/Modules/_json.c b/Modules/_json.c index d5120fa..863a30f 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -238,13 +238,6 @@ encoder_encode_float(PyEncoderObject *s, PyObject *obj); #define S_CHAR(c) (c >= ' ' && c <= '~' && c != '\\' && c != '"') #define IS_WHITESPACE(c) (((c) == ' ') || ((c) == '\t') || ((c) == '\n') || ((c) == '\r')) -#define MIN_EXPANSION 6 -#ifdef Py_UNICODE_WIDE -#define MAX_EXPANSION (2 * MIN_EXPANSION) -#else -#define MAX_EXPANSION MIN_EXPANSION -#endif - static int _convertPyInt_AsSsize_t(PyObject *o, Py_ssize_t *size_ptr) { @@ -263,7 +256,7 @@ _convertPyInt_FromSsize_t(Py_ssize_t *size_ptr) } static Py_ssize_t -ascii_escape_unichar(Py_UNICODE c, Py_UNICODE *output, Py_ssize_t chars) +ascii_escape_unichar(Py_UCS4 c, unsigned char *output, Py_ssize_t chars) { /* Escape unicode code point c to ASCII escape sequences in char *output. output must have at least 12 bytes unused to @@ -278,10 +271,9 @@ ascii_escape_unichar(Py_UNICODE c, Py_UNICODE *output, Py_ssize_t chars) case '\r': output[chars++] = 'r'; break; case '\t': output[chars++] = 't'; break; default: -#ifdef Py_UNICODE_WIDE if (c >= 0x10000) { /* UTF-16 surrogate pair */ - Py_UNICODE v = c - 0x10000; + Py_UCS4 v = c - 0x10000; c = 0xd800 | ((v >> 10) & 0x3ff); output[chars++] = 'u'; output[chars++] = "0123456789abcdef"[(c >> 12) & 0xf]; @@ -291,7 +283,6 @@ ascii_escape_unichar(Py_UNICODE c, Py_UNICODE *output, Py_ssize_t chars) c = 0xdc00 | (v & 0x3ff); output[chars++] = '\\'; } -#endif output[chars++] = 'u'; output[chars++] = "0123456789abcdef"[(c >> 12) & 0xf]; output[chars++] = "0123456789abcdef"[(c >> 8) & 0xf]; @@ -308,54 +299,52 @@ ascii_escape_unicode(PyObject *pystr) Py_ssize_t i; Py_ssize_t input_chars; Py_ssize_t output_size; - Py_ssize_t max_output_size; Py_ssize_t chars; PyObject *rval; - Py_UNICODE *output; - Py_UNICODE *input_unicode; + void *input; + unsigned char *output; + int kind; - input_chars = PyUnicode_GET_SIZE(pystr); - input_unicode = PyUnicode_AS_UNICODE(pystr); + if (PyUnicode_READY(pystr) == -1) + return NULL; + + input_chars = PyUnicode_GET_LENGTH(pystr); + input = PyUnicode_DATA(pystr); + kind = PyUnicode_KIND(pystr); + + /* Compute the output size */ + for (i = 0, output_size = 2; i < input_chars; i++) { + Py_UCS4 c = PyUnicode_READ(kind, input, i); + if (S_CHAR(c)) + output_size++; + else { + switch(c) { + case '\\': case '"': case '\b': case '\f': + case '\n': case '\r': case '\t': + output_size += 2; break; + default: + output_size += c >= 0x10000 ? 12 : 6; + } + } + } - /* One char input can be up to 6 chars output, estimate 4 of these */ - output_size = 2 + (MIN_EXPANSION * 4) + input_chars; - max_output_size = 2 + (input_chars * MAX_EXPANSION); - rval = PyUnicode_FromStringAndSize(NULL, output_size); + rval = PyUnicode_New(output_size, 127); if (rval == NULL) { return NULL; } - output = PyUnicode_AS_UNICODE(rval); + output = PyUnicode_1BYTE_DATA(rval); chars = 0; output[chars++] = '"'; for (i = 0; i < input_chars; i++) { - Py_UNICODE c = input_unicode[i]; + Py_UCS4 c = PyUnicode_READ(kind, input, i); if (S_CHAR(c)) { output[chars++] = c; } else { chars = ascii_escape_unichar(c, output, chars); } - if (output_size - chars < (1 + MAX_EXPANSION)) { - /* There's more than four, so let's resize by a lot */ - Py_ssize_t new_output_size = output_size * 2; - /* This is an upper bound */ - if (new_output_size > max_output_size) { - new_output_size = max_output_size; - } - /* Make sure that the output size changed before resizing */ - if (new_output_size != output_size) { - output_size = new_output_size; - if (PyUnicode_Resize(&rval, output_size) == -1) { - return NULL; - } - output = PyUnicode_AS_UNICODE(rval); - } - } } output[chars++] = '"'; - if (PyUnicode_Resize(&rval, chars) == -1) { - return NULL; - } return rval; } @@ -436,22 +425,30 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next Return value is a new PyUnicode */ PyObject *rval = NULL; - Py_ssize_t len = PyUnicode_GET_SIZE(pystr); + Py_ssize_t len; Py_ssize_t begin = end - 1; Py_ssize_t next /* = begin */; - const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr); + const void *buf; + int kind; PyObject *chunks = NULL; PyObject *chunk = NULL; + if (PyUnicode_READY(pystr) == -1) + return 0; + + len = PyUnicode_GET_LENGTH(pystr); + buf = PyUnicode_DATA(pystr); + kind = PyUnicode_KIND(pystr); + if (end < 0 || len <= end) { PyErr_SetString(PyExc_ValueError, "end is out of bounds"); goto bail; } while (1) { /* Find the end of the string or the next escape */ - Py_UNICODE c = 0; + Py_UCS4 c = 0; for (next = end; next < len; next++) { - c = buf[next]; + c = PyUnicode_READ(kind, buf, next); if (c == '"' || c == '\\') { break; } @@ -467,7 +464,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next /* Pick up this chunk if it's not zero length */ if (next != end) { APPEND_OLD_CHUNK - chunk = PyUnicode_FromUnicode(&buf[end], next - end); + chunk = PyUnicode_FromKindAndData( + kind, + (char*)buf + PyUnicode_KIND_SIZE(kind, end), + next - end); if (chunk == NULL) { goto bail; } @@ -481,7 +481,7 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next raise_errmsg("Unterminated string starting at", pystr, begin); goto bail; } - c = buf[next]; + c = PyUnicode_READ(kind, buf, next); if (c != 'u') { /* Non-unicode backslash escapes */ end = next + 1; @@ -511,7 +511,7 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next } /* Decode 4 hex digits */ for (; next < end; next++) { - Py_UNICODE digit = buf[next]; + Py_UCS4 digit = PyUnicode_READ(kind, buf, next); c <<= 4; switch (digit) { case '0': case '1': case '2': case '3': case '4': @@ -528,22 +528,22 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next goto bail; } } -#ifdef Py_UNICODE_WIDE /* Surrogate pair */ if ((c & 0xfc00) == 0xd800) { - Py_UNICODE c2 = 0; + Py_UCS4 c2 = 0; if (end + 6 >= len) { raise_errmsg("Unpaired high surrogate", pystr, end - 5); goto bail; } - if (buf[next++] != '\\' || buf[next++] != 'u') { + if (PyUnicode_READ(kind, buf, next++) != '\\' || + PyUnicode_READ(kind, buf, next++) != 'u') { raise_errmsg("Unpaired high surrogate", pystr, end - 5); goto bail; } end += 6; /* Decode 4 hex digits */ for (; next < end; next++) { - Py_UNICODE digit = buf[next]; + Py_UCS4 digit = PyUnicode_READ(kind, buf, next); c2 <<= 4; switch (digit) { case '0': case '1': case '2': case '3': case '4': @@ -570,10 +570,9 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next raise_errmsg("Unpaired low surrogate", pystr, end - 5); goto bail; } -#endif } APPEND_OLD_CHUNK - chunk = PyUnicode_FromUnicode(&c, 1); + chunk = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &c, 1); if (chunk == NULL) { goto bail; } @@ -711,8 +710,9 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss Returns a new PyObject (usually a dict, but object_hook can change that) */ - Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); - Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1; + void *str; + int kind; + Py_ssize_t end_idx; PyObject *val = NULL; PyObject *rval = NULL; PyObject *key = NULL; @@ -720,6 +720,13 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss int has_pairs_hook = (s->object_pairs_hook != Py_None); Py_ssize_t next_idx; + if (PyUnicode_READY(pystr) == -1) + return NULL; + + str = PyUnicode_DATA(pystr); + kind = PyUnicode_KIND(pystr); + end_idx = PyUnicode_GET_LENGTH(pystr) - 1; + if (has_pairs_hook) rval = PyList_New(0); else @@ -728,15 +735,15 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss return NULL; /* skip whitespace after { */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind,str, idx))) idx++; /* only loop if the object is non-empty */ - if (idx <= end_idx && str[idx] != '}') { + if (idx <= end_idx && PyUnicode_READ(kind, str, idx) != '}') { while (idx <= end_idx) { PyObject *memokey; /* read key */ - if (str[idx] != '"') { + if (PyUnicode_READ(kind, str, idx) != '"') { raise_errmsg("Expecting property name", pystr, idx); goto bail; } @@ -756,13 +763,13 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss idx = next_idx; /* skip whitespace between key and : delimiter, read :, skip whitespace */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; - if (idx > end_idx || str[idx] != ':') { + while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++; + if (idx > end_idx || PyUnicode_READ(kind, str, idx) != ':') { raise_errmsg("Expecting : delimiter", pystr, idx); goto bail; } idx++; - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++; /* read any JSON term */ val = scan_once_unicode(s, pystr, idx, &next_idx); @@ -790,26 +797,26 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss idx = next_idx; /* skip whitespace before } or , */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++; /* bail if the object is closed or we didn't get the , delimiter */ if (idx > end_idx) break; - if (str[idx] == '}') { + if (PyUnicode_READ(kind, str, idx) == '}') { break; } - else if (str[idx] != ',') { + else if (PyUnicode_READ(kind, str, idx) != ',') { raise_errmsg("Expecting , delimiter", pystr, idx); goto bail; } idx++; /* skip whitespace after , delimiter */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++; } } /* verify that idx < end_idx, str[idx] should be '}' */ - if (idx > end_idx || str[idx] != '}') { + if (idx > end_idx || PyUnicode_READ(kind, str, idx) != '}') { raise_errmsg("Expecting object", pystr, end_idx); goto bail; } @@ -845,19 +852,27 @@ _parse_array_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssi Returns a new PyList */ - Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); - Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1; + void *str; + int kind; + Py_ssize_t end_idx; PyObject *val = NULL; PyObject *rval = PyList_New(0); Py_ssize_t next_idx; if (rval == NULL) return NULL; + if (PyUnicode_READY(pystr) == -1) + return NULL; + + str = PyUnicode_DATA(pystr); + kind = PyUnicode_KIND(pystr); + end_idx = PyUnicode_GET_LENGTH(pystr) - 1; + /* skip whitespace after [ */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++; /* only loop if the array is non-empty */ - if (idx <= end_idx && str[idx] != ']') { + if (idx <= end_idx && PyUnicode_READ(kind, str, idx) != ']') { while (idx <= end_idx) { /* read any JSON term */ @@ -872,26 +887,26 @@ _parse_array_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssi idx = next_idx; /* skip whitespace between term and , */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++; /* bail if the array is closed or we didn't get the , delimiter */ if (idx > end_idx) break; - if (str[idx] == ']') { + if (PyUnicode_READ(kind, str, idx) == ']') { break; } - else if (str[idx] != ',') { + else if (PyUnicode_READ(kind, str, idx) != ',') { raise_errmsg("Expecting , delimiter", pystr, idx); goto bail; } idx++; /* skip whitespace after , */ - while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; + while (idx <= end_idx && IS_WHITESPACE(PyUnicode_READ(kind, str, idx))) idx++; } } - /* verify that idx < end_idx, str[idx] should be ']' */ - if (idx > end_idx || str[idx] != ']') { + /* verify that idx < end_idx, PyUnicode_READ(kind, str, idx) should be ']' */ + if (idx > end_idx || PyUnicode_READ(kind, str, idx) != ']') { raise_errmsg("Expecting object", pystr, end_idx); goto bail; } @@ -940,16 +955,24 @@ _match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_ PyInt, PyLong, or PyFloat. May return other types if parse_int or parse_float are set */ - Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); - Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1; + void *str; + int kind; + Py_ssize_t end_idx; Py_ssize_t idx = start; int is_float = 0; PyObject *rval; PyObject *numstr = NULL; PyObject *custom_func; + if (PyUnicode_READY(pystr) == -1) + return NULL; + + str = PyUnicode_DATA(pystr); + kind = PyUnicode_KIND(pystr); + end_idx = PyUnicode_GET_LENGTH(pystr) - 1; + /* read a sign if it's there, make sure it's not the end of the string */ - if (str[idx] == '-') { + if (PyUnicode_READ(kind, str, idx) == '-') { idx++; if (idx > end_idx) { PyErr_SetNone(PyExc_StopIteration); @@ -958,12 +981,12 @@ _match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_ } /* read as many integer digits as we find as long as it doesn't start with 0 */ - if (str[idx] >= '1' && str[idx] <= '9') { + if (PyUnicode_READ(kind, str, idx) >= '1' && PyUnicode_READ(kind, str, idx) <= '9') { idx++; - while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++; + while (idx <= end_idx && PyUnicode_READ(kind, str, idx) >= '0' && PyUnicode_READ(kind, str, idx) <= '9') idx++; } /* if it starts with 0 we only expect one integer digit */ - else if (str[idx] == '0') { + else if (PyUnicode_READ(kind, str, idx) == '0') { idx++; } /* no integer digits, error */ @@ -973,25 +996,25 @@ _match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_ } /* if the next char is '.' followed by a digit then read all float digits */ - if (idx < end_idx && str[idx] == '.' && str[idx + 1] >= '0' && str[idx + 1] <= '9') { + if (idx < end_idx && PyUnicode_READ(kind, str, idx) == '.' && PyUnicode_READ(kind, str, idx + 1) >= '0' && PyUnicode_READ(kind, str, idx + 1) <= '9') { is_float = 1; idx += 2; - while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++; + while (idx <= end_idx && PyUnicode_READ(kind, str, idx) >= '0' && PyUnicode_READ(kind, str, idx) <= '9') idx++; } /* if the next char is 'e' or 'E' then maybe read the exponent (or backtrack) */ - if (idx < end_idx && (str[idx] == 'e' || str[idx] == 'E')) { + if (idx < end_idx && (PyUnicode_READ(kind, str, idx) == 'e' || PyUnicode_READ(kind, str, idx) == 'E')) { Py_ssize_t e_start = idx; idx++; /* read an exponent sign if present */ - if (idx < end_idx && (str[idx] == '-' || str[idx] == '+')) idx++; + if (idx < end_idx && (PyUnicode_READ(kind, str, idx) == '-' || PyUnicode_READ(kind, str, idx) == '+')) idx++; /* read all digits */ - while (idx <= end_idx && str[idx] >= '0' && str[idx] <= '9') idx++; + while (idx <= end_idx && PyUnicode_READ(kind, str, idx) >= '0' && PyUnicode_READ(kind, str, idx) <= '9') idx++; /* if we got a digit, then parse as float. if not, backtrack */ - if (str[idx - 1] >= '0' && str[idx - 1] <= '9') { + if (PyUnicode_READ(kind, str, idx - 1) >= '0' && PyUnicode_READ(kind, str, idx - 1) <= '9') { is_float = 1; } else { @@ -1008,7 +1031,9 @@ _match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_ if (custom_func) { /* copy the section we determined to be a number */ - numstr = PyUnicode_FromUnicode(&str[start], idx - start); + numstr = PyUnicode_FromKindAndData(kind, + (char*)str + PyUnicode_KIND_SIZE(kind, start), + idx - start); if (numstr == NULL) return NULL; rval = PyObject_CallFunctionObjArgs(custom_func, numstr, NULL); @@ -1024,7 +1049,7 @@ _match_number_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t start, Py_ return NULL; buf = PyBytes_AS_STRING(numstr); for (i = 0; i < n; i++) { - buf[i] = (char) str[i + start]; + buf[i] = (char) PyUnicode_READ(kind, str, i + start); } if (is_float) rval = PyFloat_FromString(numstr); @@ -1047,13 +1072,23 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_ Returns a new PyObject representation of the term. */ PyObject *res; - Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); - Py_ssize_t length = PyUnicode_GET_SIZE(pystr); + void *str; + int kind; + Py_ssize_t length; + + if (PyUnicode_READY(pystr) == -1) + return NULL; + + str = PyUnicode_DATA(pystr); + kind = PyUnicode_KIND(pystr); + length = PyUnicode_GET_LENGTH(pystr); + if (idx >= length) { PyErr_SetNone(PyExc_StopIteration); return NULL; } - switch (str[idx]) { + + switch (PyUnicode_READ(kind, str, idx)) { case '"': /* string */ return scanstring_unicode(pystr, idx + 1, @@ -1077,7 +1112,7 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_ return res; case 'n': /* null */ - if ((idx + 3 < length) && str[idx + 1] == 'u' && str[idx + 2] == 'l' && str[idx + 3] == 'l') { + if ((idx + 3 < length) && PyUnicode_READ(kind, str, idx + 1) == 'u' && PyUnicode_READ(kind, str, idx + 2) == 'l' && PyUnicode_READ(kind, str, idx + 3) == 'l') { Py_INCREF(Py_None); *next_idx_ptr = idx + 4; return Py_None; @@ -1085,7 +1120,7 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_ break; case 't': /* true */ - if ((idx + 3 < length) && str[idx + 1] == 'r' && str[idx + 2] == 'u' && str[idx + 3] == 'e') { + if ((idx + 3 < length) && PyUnicode_READ(kind, str, idx + 1) == 'r' && PyUnicode_READ(kind, str, idx + 2) == 'u' && PyUnicode_READ(kind, str, idx + 3) == 'e') { Py_INCREF(Py_True); *next_idx_ptr = idx + 4; return Py_True; @@ -1093,7 +1128,10 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_ break; case 'f': /* false */ - if ((idx + 4 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'l' && str[idx + 3] == 's' && str[idx + 4] == 'e') { + if ((idx + 4 < length) && PyUnicode_READ(kind, str, idx + 1) == 'a' && + PyUnicode_READ(kind, str, idx + 2) == 'l' && + PyUnicode_READ(kind, str, idx + 3) == 's' && + PyUnicode_READ(kind, str, idx + 4) == 'e') { Py_INCREF(Py_False); *next_idx_ptr = idx + 5; return Py_False; @@ -1101,19 +1139,33 @@ scan_once_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ssize_ break; case 'N': /* NaN */ - if ((idx + 2 < length) && str[idx + 1] == 'a' && str[idx + 2] == 'N') { + if ((idx + 2 < length) && PyUnicode_READ(kind, str, idx + 1) == 'a' && + PyUnicode_READ(kind, str, idx + 2) == 'N') { return _parse_constant(s, "NaN", idx, next_idx_ptr); } break; case 'I': /* Infinity */ - if ((idx + 7 < length) && str[idx + 1] == 'n' && str[idx + 2] == 'f' && str[idx + 3] == 'i' && str[idx + 4] == 'n' && str[idx + 5] == 'i' && str[idx + 6] == 't' && str[idx + 7] == 'y') { + if ((idx + 7 < length) && PyUnicode_READ(kind, str, idx + 1) == 'n' && + PyUnicode_READ(kind, str, idx + 2) == 'f' && + PyUnicode_READ(kind, str, idx + 3) == 'i' && + PyUnicode_READ(kind, str, idx + 4) == 'n' && + PyUnicode_READ(kind, str, idx + 5) == 'i' && + PyUnicode_READ(kind, str, idx + 6) == 't' && + PyUnicode_READ(kind, str, idx + 7) == 'y') { return _parse_constant(s, "Infinity", idx, next_idx_ptr); } break; case '-': /* -Infinity */ - if ((idx + 8 < length) && str[idx + 1] == 'I' && str[idx + 2] == 'n' && str[idx + 3] == 'f' && str[idx + 4] == 'i' && str[idx + 5] == 'n' && str[idx + 6] == 'i' && str[idx + 7] == 't' && str[idx + 8] == 'y') { + if ((idx + 8 < length) && PyUnicode_READ(kind, str, idx + 1) == 'I' && + PyUnicode_READ(kind, str, idx + 2) == 'n' && + PyUnicode_READ(kind, str, idx + 3) == 'f' && + PyUnicode_READ(kind, str, idx + 4) == 'i' && + PyUnicode_READ(kind, str, idx + 5) == 'n' && + PyUnicode_READ(kind, str, idx + 6) == 'i' && + PyUnicode_READ(kind, str, idx + 7) == 't' && + PyUnicode_READ(kind, str, idx + 8) == 'y') { return _parse_constant(s, "-Infinity", idx, next_idx_ptr); } break; diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 4389f72..19fac40 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -1867,9 +1867,7 @@ save_unicode(PicklerObject *self, PyObject *obj) if (self->bin) { char pdata[5]; - encoded = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(obj), - PyUnicode_GET_SIZE(obj), - "surrogatepass"); + encoded = PyUnicode_AsEncodedString(obj, "utf-8", "surrogatepass"); if (encoded == NULL) goto error; diff --git a/Modules/_sqlite/connection.c b/Modules/_sqlite/connection.c index 3c83c7c..310a27c 100644 --- a/Modules/_sqlite/connection.c +++ b/Modules/_sqlite/connection.c @@ -1436,10 +1436,11 @@ pysqlite_connection_create_collation(pysqlite_Connection* self, PyObject* args) PyObject* uppercase_name = 0; PyObject* name; PyObject* retval; - Py_UNICODE* chk; Py_ssize_t i, len; char *uppercase_name_str; int rc; + unsigned int kind; + void *data; if (!pysqlite_check_thread(self) || !pysqlite_check_connection(self)) { goto finally; @@ -1454,12 +1455,16 @@ pysqlite_connection_create_collation(pysqlite_Connection* self, PyObject* args) goto finally; } - len = PyUnicode_GET_SIZE(uppercase_name); - chk = PyUnicode_AS_UNICODE(uppercase_name); - for (i=0; i<len; i++, chk++) { - if ((*chk >= '0' && *chk <= '9') - || (*chk >= 'A' && *chk <= 'Z') - || (*chk == '_')) + if (PyUnicode_READY(uppercase_name)) + goto finally; + len = PyUnicode_GET_LENGTH(uppercase_name); + kind = PyUnicode_KIND(uppercase_name); + data = PyUnicode_DATA(uppercase_name); + for (i=0; i<len; i++) { + Py_UCS4 ch = PyUnicode_READ(kind, data, i); + if ((ch >= '0' && ch <= '9') + || (ch >= 'A' && ch <= 'Z') + || (ch == '_')) { continue; } else { diff --git a/Modules/_sre.c b/Modules/_sre.c index 2b764b1..bceb9d6 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -163,8 +163,6 @@ static unsigned int sre_lower_locale(unsigned int ch) /* unicode-specific character predicates */ -#if defined(HAVE_UNICODE) - #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch)) #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch)) #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch)) @@ -176,8 +174,6 @@ static unsigned int sre_lower_unicode(unsigned int ch) return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch)); } -#endif - LOCAL(int) sre_category(SRE_CODE category, unsigned int ch) { @@ -205,7 +201,6 @@ sre_category(SRE_CODE category, unsigned int ch) case SRE_CATEGORY_LOC_NOT_WORD: return !SRE_LOC_IS_WORD(ch); -#if defined(HAVE_UNICODE) case SRE_CATEGORY_UNI_DIGIT: return SRE_UNI_IS_DIGIT(ch); case SRE_CATEGORY_UNI_NOT_DIGIT: @@ -222,24 +217,6 @@ sre_category(SRE_CODE category, unsigned int ch) return SRE_UNI_IS_LINEBREAK(ch); case SRE_CATEGORY_UNI_NOT_LINEBREAK: return !SRE_UNI_IS_LINEBREAK(ch); -#else - case SRE_CATEGORY_UNI_DIGIT: - return SRE_IS_DIGIT(ch); - case SRE_CATEGORY_UNI_NOT_DIGIT: - return !SRE_IS_DIGIT(ch); - case SRE_CATEGORY_UNI_SPACE: - return SRE_IS_SPACE(ch); - case SRE_CATEGORY_UNI_NOT_SPACE: - return !SRE_IS_SPACE(ch); - case SRE_CATEGORY_UNI_WORD: - return SRE_LOC_IS_WORD(ch); - case SRE_CATEGORY_UNI_NOT_WORD: - return !SRE_LOC_IS_WORD(ch); - case SRE_CATEGORY_UNI_LINEBREAK: - return SRE_IS_LINEBREAK(ch); - case SRE_CATEGORY_UNI_NOT_LINEBREAK: - return !SRE_IS_LINEBREAK(ch); -#endif } return 0; } @@ -280,6 +257,7 @@ data_stack_grow(SRE_STATE* state, Py_ssize_t size) /* generate 8-bit version */ #define SRE_CHAR unsigned char +#define SRE_CHARGET(state, buf, index) ((unsigned char*)buf)[index] #define SRE_AT sre_at #define SRE_COUNT sre_count #define SRE_CHARSET sre_charset @@ -287,15 +265,11 @@ data_stack_grow(SRE_STATE* state, Py_ssize_t size) #define SRE_MATCH sre_match #define SRE_MATCH_CONTEXT sre_match_context #define SRE_SEARCH sre_search -#define SRE_LITERAL_TEMPLATE sre_literal_template - -#if defined(HAVE_UNICODE) #define SRE_RECURSIVE #include "_sre.c" #undef SRE_RECURSIVE -#undef SRE_LITERAL_TEMPLATE #undef SRE_SEARCH #undef SRE_MATCH #undef SRE_MATCH_CONTEXT @@ -304,10 +278,15 @@ data_stack_grow(SRE_STATE* state, Py_ssize_t size) #undef SRE_COUNT #undef SRE_AT #undef SRE_CHAR +#undef SRE_CHARGET -/* generate 16-bit unicode version */ +/* generate 8/16/32-bit unicode version */ -#define SRE_CHAR Py_UNICODE +#define SRE_CHAR void +#define SRE_CHARGET(state, buf, index) \ + ((state->charsize==1) ? ((Py_UCS1*)buf)[index] : \ + (state->charsize==2) ? ((Py_UCS2*)buf)[index] : \ + ((Py_UCS4*)buf)[index]) #define SRE_AT sre_uat #define SRE_COUNT sre_ucount #define SRE_CHARSET sre_ucharset @@ -315,8 +294,6 @@ data_stack_grow(SRE_STATE* state, Py_ssize_t size) #define SRE_MATCH sre_umatch #define SRE_MATCH_CONTEXT sre_umatch_context #define SRE_SEARCH sre_usearch -#define SRE_LITERAL_TEMPLATE sre_uliteral_template -#endif #endif /* SRE_RECURSIVE */ @@ -327,7 +304,7 @@ data_stack_grow(SRE_STATE* state, Py_ssize_t size) settings */ LOCAL(int) -SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) +SRE_AT(SRE_STATE* state, char* ptr, SRE_CODE at) { /* check if pointer is at given position */ @@ -341,16 +318,16 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) case SRE_AT_BEGINNING_LINE: return ((void*) ptr == state->beginning || - SRE_IS_LINEBREAK((int) ptr[-1])); + SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, -1))); case SRE_AT_END: - return (((void*) (ptr+1) == state->end && - SRE_IS_LINEBREAK((int) ptr[0])) || + return (((void*) (ptr+state->charsize) == state->end && + SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0))) || ((void*) ptr == state->end)); case SRE_AT_END_LINE: return ((void*) ptr == state->end || - SRE_IS_LINEBREAK((int) ptr[0])); + SRE_IS_LINEBREAK((int) SRE_CHARGET(state, ptr, 0))); case SRE_AT_END_STRING: return ((void*) ptr == state->end); @@ -359,57 +336,55 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) if (state->beginning == state->end) return 0; thatp = ((void*) ptr > state->beginning) ? - SRE_IS_WORD((int) ptr[-1]) : 0; + SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0; thisp = ((void*) ptr < state->end) ? - SRE_IS_WORD((int) ptr[0]) : 0; + SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0; return thisp != thatp; case SRE_AT_NON_BOUNDARY: if (state->beginning == state->end) return 0; thatp = ((void*) ptr > state->beginning) ? - SRE_IS_WORD((int) ptr[-1]) : 0; + SRE_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0; thisp = ((void*) ptr < state->end) ? - SRE_IS_WORD((int) ptr[0]) : 0; + SRE_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0; return thisp == thatp; case SRE_AT_LOC_BOUNDARY: if (state->beginning == state->end) return 0; thatp = ((void*) ptr > state->beginning) ? - SRE_LOC_IS_WORD((int) ptr[-1]) : 0; + SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0; thisp = ((void*) ptr < state->end) ? - SRE_LOC_IS_WORD((int) ptr[0]) : 0; + SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0; return thisp != thatp; case SRE_AT_LOC_NON_BOUNDARY: if (state->beginning == state->end) return 0; thatp = ((void*) ptr > state->beginning) ? - SRE_LOC_IS_WORD((int) ptr[-1]) : 0; + SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0; thisp = ((void*) ptr < state->end) ? - SRE_LOC_IS_WORD((int) ptr[0]) : 0; + SRE_LOC_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0; return thisp == thatp; -#if defined(HAVE_UNICODE) case SRE_AT_UNI_BOUNDARY: if (state->beginning == state->end) return 0; thatp = ((void*) ptr > state->beginning) ? - SRE_UNI_IS_WORD((int) ptr[-1]) : 0; + SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0; thisp = ((void*) ptr < state->end) ? - SRE_UNI_IS_WORD((int) ptr[0]) : 0; + SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0; return thisp != thatp; case SRE_AT_UNI_NON_BOUNDARY: if (state->beginning == state->end) return 0; thatp = ((void*) ptr > state->beginning) ? - SRE_UNI_IS_WORD((int) ptr[-1]) : 0; + SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, -1)) : 0; thisp = ((void*) ptr < state->end) ? - SRE_UNI_IS_WORD((int) ptr[0]) : 0; + SRE_UNI_IS_WORD((int) SRE_CHARGET(state, ptr, 0)) : 0; return thisp == thatp; -#endif } @@ -476,7 +451,7 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) count = *(set++); if (sizeof(SRE_CODE) == 2) { - block = ((unsigned char*)set)[ch >> 8]; + block = ((char*)set)[ch >> 8]; set += 128; if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15))) return ok; @@ -486,7 +461,7 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids * warnings when c's type supports only numbers < N+1 */ if (!(ch & ~65535)) - block = ((unsigned char*)set)[ch >> 8]; + block = ((char*)set)[ch >> 8]; else block = -1; set += 64; @@ -512,28 +487,29 @@ LOCAL(Py_ssize_t) SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) { SRE_CODE chr; - SRE_CHAR* ptr = (SRE_CHAR *)state->ptr; - SRE_CHAR* end = (SRE_CHAR *)state->end; + char* ptr = (char *)state->ptr; + char* end = (char *)state->end; Py_ssize_t i; /* adjust end */ if (maxcount < end - ptr && maxcount != 65535) - end = ptr + maxcount; + end = ptr + maxcount*state->charsize; switch (pattern[0]) { case SRE_OP_IN: /* repeated set */ TRACE(("|%p|%p|COUNT IN\n", pattern, ptr)); - while (ptr < end && SRE_CHARSET(pattern + 2, *ptr)) - ptr++; + while (ptr < end && + SRE_CHARSET(pattern + 2, SRE_CHARGET(state, ptr, 0))) + ptr += state->charsize; break; case SRE_OP_ANY: /* repeated dot wildcard. */ TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr)); - while (ptr < end && !SRE_IS_LINEBREAK(*ptr)) - ptr++; + while (ptr < end && !SRE_IS_LINEBREAK(SRE_CHARGET(state, ptr, 0))) + ptr += state->charsize; break; case SRE_OP_ANY_ALL: @@ -547,38 +523,38 @@ SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) /* repeated literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr)); - while (ptr < end && (SRE_CODE) *ptr == chr) - ptr++; + while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) == chr) + ptr += state->charsize; break; case SRE_OP_LITERAL_IGNORE: /* repeated literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr)); - while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr) - ptr++; + while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) == chr) + ptr += state->charsize; break; case SRE_OP_NOT_LITERAL: /* repeated non-literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr)); - while (ptr < end && (SRE_CODE) *ptr != chr) - ptr++; + while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr) + ptr += state->charsize; break; case SRE_OP_NOT_LITERAL_IGNORE: /* repeated non-literal */ chr = pattern[1]; TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr)); - while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr) - ptr++; + while (ptr < end && (SRE_CODE) state->lower(SRE_CHARGET(state, ptr, 0)) != chr) + ptr += state->charsize; break; default: /* repeated single character pattern */ TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr)); - while ((SRE_CHAR*) state->ptr < end) { + while ((char*) state->ptr < end) { i = SRE_MATCH(state, pattern); if (i < 0) return i; @@ -586,12 +562,12 @@ SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) break; } TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, - (SRE_CHAR*) state->ptr - ptr)); - return (SRE_CHAR*) state->ptr - ptr; + ((char*)state->ptr - ptr)/state->charsize)); + return ((char*)state->ptr - ptr)/state->charsize; } - TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr)); - return ptr - (SRE_CHAR*) state->ptr; + TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, (ptr - (char*) state->ptr)/state->charsize)); + return (ptr - (char*) state->ptr)/state->charsize; } #if 0 /* not used in this release */ @@ -602,8 +578,8 @@ SRE_INFO(SRE_STATE* state, SRE_CODE* pattern) returns the number of SRE_CODE objects to skip if successful, 0 if no match */ - SRE_CHAR* end = state->end; - SRE_CHAR* ptr = state->ptr; + char* end = state->end; + char* ptr = state->ptr; Py_ssize_t i; /* check minimal length */ @@ -614,7 +590,7 @@ SRE_INFO(SRE_STATE* state, SRE_CODE* pattern) if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) { /* <length> <skip> <prefix data> <overlap data> */ for (i = 0; i < pattern[5]; i++) - if ((SRE_CODE) ptr[i] != pattern[7 + i]) + if ((SRE_CODE) SRE_CHARGET(state, ptr, i) != pattern[7 + i]) return 0; return pattern[0] + 2 * pattern[6]; } @@ -783,7 +759,7 @@ do { \ typedef struct { Py_ssize_t last_ctx_pos; Py_ssize_t jump; - SRE_CHAR* ptr; + char* ptr; SRE_CODE* pattern; Py_ssize_t count; Py_ssize_t lastmark; @@ -799,7 +775,7 @@ typedef struct { LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) { - SRE_CHAR* end = (SRE_CHAR *)state->end; + char* end = (char*)state->end; Py_ssize_t alloc_pos, ctx_pos = -1; Py_ssize_t i, ret = 0; Py_ssize_t jump; @@ -818,12 +794,12 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) entrance: - ctx->ptr = (SRE_CHAR *)state->ptr; + ctx->ptr = (char *)state->ptr; if (ctx->pattern[0] == SRE_OP_INFO) { /* optimization info block */ /* <INFO> <1=skip> <2=flags> <3=min> ... */ - if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) { + if (ctx->pattern[3] && (end - ctx->ptr)/state->charsize < ctx->pattern[3]) { TRACE(("reject (got %d chars, need %d)\n", (end - ctx->ptr), ctx->pattern[3])); RETURN_FAILURE; @@ -865,10 +841,10 @@ entrance: /* <LITERAL> <code> */ TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern, ctx->ptr, *ctx->pattern)); - if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0]) + if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[0]) RETURN_FAILURE; ctx->pattern++; - ctx->ptr++; + ctx->ptr += state->charsize; break; case SRE_OP_NOT_LITERAL: @@ -876,10 +852,10 @@ entrance: /* <NOT_LITERAL> <code> */ TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern, ctx->ptr, *ctx->pattern)); - if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0]) + if (ctx->ptr >= end || (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) == ctx->pattern[0]) RETURN_FAILURE; ctx->pattern++; - ctx->ptr++; + ctx->ptr += state->charsize; break; case SRE_OP_SUCCESS: @@ -902,19 +878,19 @@ entrance: /* <CATEGORY> <code> */ TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern, ctx->ptr, *ctx->pattern)); - if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0])) + if (ctx->ptr >= end || !sre_category(ctx->pattern[0], SRE_CHARGET(state, ctx->ptr, 0))) RETURN_FAILURE; ctx->pattern++; - ctx->ptr++; + ctx->ptr += state->charsize; break; case SRE_OP_ANY: /* match anything (except a newline) */ /* <ANY> */ TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr)); - if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0])) - RETURN_FAILURE; - ctx->ptr++; + if (ctx->ptr >= end || SRE_IS_LINEBREAK(SRE_CHARGET(state, ctx->ptr, 0))) + RETURN_FAILURE; + ctx->ptr += state->charsize; break; case SRE_OP_ANY_ALL: @@ -923,47 +899,47 @@ entrance: TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr)); if (ctx->ptr >= end) RETURN_FAILURE; - ctx->ptr++; + ctx->ptr += state->charsize; break; case SRE_OP_IN: /* match set member (or non_member) */ /* <IN> <skip> <set> */ TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr)); - if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr)) - RETURN_FAILURE; + if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, SRE_CHARGET(state, ctx->ptr, 0))) + RETURN_FAILURE; ctx->pattern += ctx->pattern[0]; - ctx->ptr++; + ctx->ptr += state->charsize; break; case SRE_OP_LITERAL_IGNORE: TRACE(("|%p|%p|LITERAL_IGNORE %d\n", ctx->pattern, ctx->ptr, ctx->pattern[0])); if (ctx->ptr >= end || - state->lower(*ctx->ptr) != state->lower(*ctx->pattern)) + state->lower(SRE_CHARGET(state, ctx->ptr, 0)) != state->lower(*ctx->pattern)) RETURN_FAILURE; ctx->pattern++; - ctx->ptr++; + ctx->ptr += state->charsize; break; case SRE_OP_NOT_LITERAL_IGNORE: TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", ctx->pattern, ctx->ptr, *ctx->pattern)); if (ctx->ptr >= end || - state->lower(*ctx->ptr) == state->lower(*ctx->pattern)) + state->lower(SRE_CHARGET(state, ctx->ptr, 0)) == state->lower(*ctx->pattern)) RETURN_FAILURE; ctx->pattern++; - ctx->ptr++; + ctx->ptr += state->charsize; break; case SRE_OP_IN_IGNORE: TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr)); if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern+1, - (SRE_CODE)state->lower(*ctx->ptr))) + (SRE_CODE)state->lower(SRE_CHARGET(state, ctx->ptr, 0)))) RETURN_FAILURE; ctx->pattern += ctx->pattern[0]; - ctx->ptr++; + ctx->ptr += state->charsize; break; case SRE_OP_JUMP: @@ -986,11 +962,11 @@ entrance: for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) { if (ctx->pattern[1] == SRE_OP_LITERAL && (ctx->ptr >= end || - (SRE_CODE) *ctx->ptr != ctx->pattern[2])) + (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0) != ctx->pattern[2])) continue; if (ctx->pattern[1] == SRE_OP_IN && (ctx->ptr >= end || - !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr))) + !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) SRE_CHARGET(state, ctx->ptr, 0)))) continue; state->ptr = ctx->ptr; DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1); @@ -1021,7 +997,7 @@ entrance: TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, ctx->pattern[1], ctx->pattern[2])); - if (ctx->ptr + ctx->pattern[1] > end) + if (ctx->ptr + state->charsize * ctx->pattern[1] > end) RETURN_FAILURE; /* cannot match */ state->ptr = ctx->ptr; @@ -1030,7 +1006,7 @@ entrance: RETURN_ON_ERROR(ret); DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos); ctx->count = ret; - ctx->ptr += ctx->count; + ctx->ptr += state->charsize * ctx->count; /* when we arrive here, count contains the number of matches, and ctx->ptr points to the tail of the target @@ -1054,8 +1030,9 @@ entrance: ctx->u.chr = ctx->pattern[ctx->pattern[0]+1]; for (;;) { while (ctx->count >= (Py_ssize_t) ctx->pattern[1] && - (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) { - ctx->ptr--; + (ctx->ptr >= end || + SRE_CHARGET(state, ctx->ptr, 0) != ctx->u.chr)) { + ctx->ptr -= state->charsize; ctx->count--; } if (ctx->count < (Py_ssize_t) ctx->pattern[1]) @@ -1070,7 +1047,7 @@ entrance: LASTMARK_RESTORE(); - ctx->ptr--; + ctx->ptr -= state->charsize; ctx->count--; } @@ -1084,7 +1061,7 @@ entrance: RETURN_ON_ERROR(ret); RETURN_SUCCESS; } - ctx->ptr--; + ctx->ptr -= state->charsize; ctx->count--; LASTMARK_RESTORE(); } @@ -1104,7 +1081,7 @@ entrance: TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, ctx->pattern[1], ctx->pattern[2])); - if (ctx->ptr + ctx->pattern[1] > end) + if (ctx->ptr + state->charsize * ctx->pattern[1] > end) RETURN_FAILURE; /* cannot match */ state->ptr = ctx->ptr; @@ -1121,7 +1098,7 @@ entrance: RETURN_FAILURE; /* advance past minimum matches of repeat */ ctx->count = ret; - ctx->ptr += ctx->count; + ctx->ptr += state->charsize * ctx->count; } if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) { @@ -1148,7 +1125,7 @@ entrance: if (ret == 0) break; assert(ret == 1); - ctx->ptr++; + ctx->ptr += state->charsize; ctx->count++; LASTMARK_RESTORE(); } @@ -1320,14 +1297,16 @@ entrance: if (groupref >= state->lastmark) { RETURN_FAILURE; } else { - SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; - SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; + char* p = (char*) state->mark[groupref]; + char* e = (char*) state->mark[groupref+1]; if (!p || !e || e < p) RETURN_FAILURE; while (p < e) { - if (ctx->ptr >= end || *ctx->ptr != *p) + if (ctx->ptr >= end || + SRE_CHARGET(state, ctx->ptr, 0) != SRE_CHARGET(state, p, 0)) RETURN_FAILURE; - p++; ctx->ptr++; + p += state->charsize; + ctx->ptr += state->charsize; } } } @@ -1344,15 +1323,16 @@ entrance: if (groupref >= state->lastmark) { RETURN_FAILURE; } else { - SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; - SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; + char* p = (char*) state->mark[groupref]; + char* e = (char*) state->mark[groupref+1]; if (!p || !e || e < p) RETURN_FAILURE; while (p < e) { if (ctx->ptr >= end || - state->lower(*ctx->ptr) != state->lower(*p)) + state->lower(SRE_CHARGET(state, ctx->ptr, 0)) != state->lower(*p)) RETURN_FAILURE; - p++; ctx->ptr++; + p++; + ctx->ptr += state->charsize; } } } @@ -1386,7 +1366,7 @@ entrance: /* <ASSERT> <skip> <back> <pattern> */ TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern, ctx->ptr, ctx->pattern[1])); - state->ptr = ctx->ptr - ctx->pattern[1]; + state->ptr = ctx->ptr - state->charsize * ctx->pattern[1]; if (state->ptr < state->beginning) RETURN_FAILURE; DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2); @@ -1399,7 +1379,7 @@ entrance: /* <ASSERT_NOT> <skip> <back> <pattern> */ TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern, ctx->ptr, ctx->pattern[1])); - state->ptr = ctx->ptr - ctx->pattern[1]; + state->ptr = ctx->ptr - state->charsize * ctx->pattern[1]; if (state->ptr >= state->beginning) { DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2); if (ret) { @@ -1481,8 +1461,8 @@ exit: LOCAL(Py_ssize_t) SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) { - SRE_CHAR* ptr = (SRE_CHAR *)state->start; - SRE_CHAR* end = (SRE_CHAR *)state->end; + char* ptr = (char*)state->start; + char* end = (char*)state->end; Py_ssize_t status = 0; Py_ssize_t prefix_len = 0; Py_ssize_t prefix_skip = 0; @@ -1500,9 +1480,9 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) if (pattern[3] > 1) { /* adjust end point (but make sure we leave at least one character in there, so literal search will work) */ - end -= pattern[3]-1; + end -= (pattern[3]-1) * state->charsize; if (end <= ptr) - end = ptr+1; + end = ptr + state->charsize; } if (flags & SRE_INFO_PREFIX) { @@ -1528,10 +1508,10 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) /* pattern starts with a known prefix. use the overlap table to skip forward as fast as we possibly can */ Py_ssize_t i = 0; - end = (SRE_CHAR *)state->end; + end = (char *)state->end; while (ptr < end) { for (;;) { - if ((SRE_CODE) ptr[0] != prefix[i]) { + if ((SRE_CODE) SRE_CHARGET(state, ptr, 0) != prefix[i]) { if (!i) break; else @@ -1540,8 +1520,8 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) if (++i == prefix_len) { /* found a potential match */ TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr)); - state->start = ptr + 1 - prefix_len; - state->ptr = ptr + 1 - prefix_len + prefix_skip; + state->start = ptr - (prefix_len - 1) * state->charsize; + state->ptr = ptr - (prefix_len - prefix_skip - 1) * state->charsize; if (flags & SRE_INFO_LITERAL) return 1; /* we got all of it */ status = SRE_MATCH(state, pattern + 2*prefix_skip); @@ -1553,7 +1533,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) break; } } - ptr++; + ptr += state->charsize; } return 0; } @@ -1563,15 +1543,16 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) /* pattern starts with a literal character. this is used for short prefixes, and if fast search is disabled */ SRE_CODE chr = pattern[1]; - end = (SRE_CHAR *)state->end; + end = (char*)state->end; for (;;) { - while (ptr < end && (SRE_CODE) ptr[0] != chr) - ptr++; + while (ptr < end && (SRE_CODE) SRE_CHARGET(state, ptr, 0) != chr) + ptr += state->charsize; if (ptr >= end) return 0; TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr)); state->start = ptr; - state->ptr = ++ptr; + ptr += state->charsize; + state->ptr = ptr; if (flags & SRE_INFO_LITERAL) return 1; /* we got all of it */ status = SRE_MATCH(state, pattern + 2); @@ -1580,10 +1561,10 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) } } else if (charset) { /* pattern starts with a character from a known set */ - end = (SRE_CHAR *)state->end; + end = (char*)state->end; for (;;) { - while (ptr < end && !SRE_CHARSET(charset, ptr[0])) - ptr++; + while (ptr < end && !SRE_CHARSET(charset, SRE_CHARGET(state, ptr, 0))) + ptr += state->charsize; if (ptr >= end) return 0; TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr)); @@ -1592,13 +1573,14 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) status = SRE_MATCH(state, pattern); if (status != 0) break; - ptr++; + ptr += state->charsize; } } else /* general case */ while (ptr <= end) { TRACE(("|%p|%p|SEARCH\n", pattern, ptr)); - state->start = state->ptr = ptr++; + state->start = state->ptr = ptr; + ptr += state->charsize; status = SRE_MATCH(state, pattern); if (status != 0) break; @@ -1607,16 +1589,6 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) return status; } -LOCAL(int) -SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len) -{ - /* check if given string is a literal template (i.e. no escapes) */ - while (len-- > 0) - if (*ptr++ == '\\') - return 0; - return 1; -} - #if !defined(SRE_RECURSIVE) /* -------------------------------------------------------------------- */ @@ -1626,6 +1598,23 @@ SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len) static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int); static PyObject*pattern_scanner(PatternObject*, PyObject*); +static int +sre_literal_template(int charsize, char* ptr, Py_ssize_t len) +{ + /* check if given string is a literal template (i.e. no escapes) */ + struct { + int charsize; + } state = { + charsize + }; + while (len-- > 0) { + if (SRE_CHARGET((&state), ptr, 0) == '\\') + return 0; + ptr += charsize; + } + return 1; +} + static PyObject * sre_codesize(PyObject* self, PyObject *unused) { @@ -1641,11 +1630,7 @@ sre_getlower(PyObject* self, PyObject* args) if (flags & SRE_FLAG_LOCALE) return Py_BuildValue("i", sre_lower_locale(character)); if (flags & SRE_FLAG_UNICODE) -#if defined(HAVE_UNICODE) return Py_BuildValue("i", sre_lower_unicode(character)); -#else - return Py_BuildValue("i", sre_lower_locale(character)); -#endif return Py_BuildValue("i", sre_lower(character)); } @@ -1664,7 +1649,8 @@ state_reset(SRE_STATE* state) } static void* -getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize) +getstring(PyObject* string, Py_ssize_t* p_length, + int* p_logical_charsize, int* p_charsize) { /* given a python object, return a data pointer, a length (in characters), and a character size. return NULL if the object @@ -1679,9 +1665,12 @@ getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize) /* Unicode objects do not support the buffer API. So, get the data directly instead. */ if (PyUnicode_Check(string)) { - ptr = (void *)PyUnicode_AS_DATA(string); - *p_length = PyUnicode_GET_SIZE(string); - *p_charsize = sizeof(Py_UNICODE); + if (PyUnicode_READY(string) == -1) + return NULL; + ptr = PyUnicode_DATA(string); + *p_length = PyUnicode_GET_LENGTH(string); + *p_charsize = PyUnicode_CHARACTER_SIZE(string); + *p_logical_charsize = 4; return ptr; } @@ -1713,10 +1702,8 @@ getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize) if (PyBytes_Check(string) || bytes == size) charsize = 1; -#if defined(HAVE_UNICODE) else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE))) charsize = sizeof(Py_UNICODE); -#endif else { PyErr_SetString(PyExc_TypeError, "buffer size mismatch"); return NULL; @@ -1724,6 +1711,7 @@ getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize) *p_length = size; *p_charsize = charsize; + *p_logical_charsize = charsize; if (ptr == NULL) { PyErr_SetString(PyExc_ValueError, @@ -1739,7 +1727,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, /* prepare state object */ Py_ssize_t length; - int charsize; + int logical_charsize, charsize; void* ptr; memset(state, 0, sizeof(SRE_STATE)); @@ -1747,16 +1735,16 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, state->lastmark = -1; state->lastindex = -1; - ptr = getstring(string, &length, &charsize); + ptr = getstring(string, &length, &logical_charsize, &charsize); if (!ptr) return NULL; - if (charsize == 1 && pattern->charsize > 1) { + if (logical_charsize == 1 && pattern->logical_charsize > 1) { PyErr_SetString(PyExc_TypeError, "can't use a string pattern on a bytes-like object"); return NULL; } - if (charsize > 1 && pattern->charsize == 1) { + if (logical_charsize > 1 && pattern->logical_charsize == 1) { PyErr_SetString(PyExc_TypeError, "can't use a bytes pattern on a string-like object"); return NULL; @@ -1773,6 +1761,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, else if (end > length) end = length; + state->logical_charsize = logical_charsize; state->charsize = charsize; state->beginning = ptr; @@ -1788,11 +1777,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, if (pattern->flags & SRE_FLAG_LOCALE) state->lower = sre_lower_locale; else if (pattern->flags & SRE_FLAG_UNICODE) -#if defined(HAVE_UNICODE) state->lower = sre_lower_unicode; -#else - state->lower = sre_lower_locale; -#endif else state->lower = sre_lower; @@ -1891,12 +1876,10 @@ pattern_match(PatternObject* self, PyObject* args, PyObject* kw) TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr)); - if (state.charsize == 1) { + if (state.logical_charsize == 1) { status = sre_match(&state, PatternObject_GetCode(self)); } else { -#if defined(HAVE_UNICODE) status = sre_umatch(&state, PatternObject_GetCode(self)); -#endif } TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); @@ -1928,12 +1911,10 @@ pattern_search(PatternObject* self, PyObject* args, PyObject* kw) TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr)); - if (state.charsize == 1) { + if (state.logical_charsize == 1) { status = sre_search(&state, PatternObject_GetCode(self)); } else { -#if defined(HAVE_UNICODE) status = sre_usearch(&state, PatternObject_GetCode(self)); -#endif } TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); @@ -2075,12 +2056,10 @@ pattern_findall(PatternObject* self, PyObject* args, PyObject* kw) state.ptr = state.start; - if (state.charsize == 1) { + if (state.logical_charsize == 1) { status = sre_search(&state, PatternObject_GetCode(self)); } else { -#if defined(HAVE_UNICODE) status = sre_usearch(&state, PatternObject_GetCode(self)); -#endif } if (PyErr_Occurred()) @@ -2205,12 +2184,10 @@ pattern_split(PatternObject* self, PyObject* args, PyObject* kw) state.ptr = state.start; - if (state.charsize == 1) { + if (state.logical_charsize == 1) { status = sre_search(&state, PatternObject_GetCode(self)); } else { -#if defined(HAVE_UNICODE) status = sre_usearch(&state, PatternObject_GetCode(self)); -#endif } if (PyErr_Occurred()) @@ -2295,7 +2272,7 @@ pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string, int status; Py_ssize_t n; Py_ssize_t i, b, e; - int bint; + int logical_charsize, charsize; int filter_is_callable; if (PyCallable_Check(ptemplate)) { @@ -2306,16 +2283,10 @@ pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string, } else { /* if not callable, check if it's a literal string */ int literal; - ptr = getstring(ptemplate, &n, &bint); - b = bint; + ptr = getstring(ptemplate, &n, &logical_charsize, &charsize); + b = charsize; if (ptr) { - if (b == 1) { - literal = sre_literal_template((unsigned char *)ptr, n); - } else { -#if defined(HAVE_UNICODE) - literal = sre_uliteral_template((Py_UNICODE *)ptr, n); -#endif - } + literal = sre_literal_template(b, ptr, n); } else { PyErr_Clear(); literal = 0; @@ -2357,12 +2328,10 @@ pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string, state.ptr = state.start; - if (state.charsize == 1) { + if (state.logical_charsize == 1) { status = sre_search(&state, PatternObject_GetCode(self)); } else { -#if defined(HAVE_UNICODE) status = sre_usearch(&state, PatternObject_GetCode(self)); -#endif } if (PyErr_Occurred()) @@ -2694,15 +2663,18 @@ _compile(PyObject* self_, PyObject* args) return NULL; } - if (pattern == Py_None) - self->charsize = -1; - else { - Py_ssize_t p_length; - if (!getstring(pattern, &p_length, &self->charsize)) { - Py_DECREF(self); - return NULL; - } - } + if (pattern == Py_None) { + self->logical_charsize = -1; + self->charsize = -1; + } + else { + Py_ssize_t p_length; + if (!getstring(pattern, &p_length, &self->logical_charsize, + &self->charsize)) { + Py_DECREF(self); + return NULL; + } + } Py_INCREF(pattern); self->pattern = pattern; @@ -3746,12 +3718,10 @@ scanner_match(ScannerObject* self, PyObject *unused) state->ptr = state->start; - if (state->charsize == 1) { + if (state->logical_charsize == 1) { status = sre_match(state, PatternObject_GetCode(self->pattern)); } else { -#if defined(HAVE_UNICODE) status = sre_umatch(state, PatternObject_GetCode(self->pattern)); -#endif } if (PyErr_Occurred()) return NULL; @@ -3779,12 +3749,10 @@ scanner_search(ScannerObject* self, PyObject *unused) state->ptr = state->start; - if (state->charsize == 1) { + if (state->logical_charsize == 1) { status = sre_search(state, PatternObject_GetCode(self->pattern)); } else { -#if defined(HAVE_UNICODE) status = sre_usearch(state, PatternObject_GetCode(self->pattern)); -#endif } if (PyErr_Occurred()) return NULL; diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c index 456a8a59..2e36c73 100644 --- a/Modules/_testcapimodule.c +++ b/Modules/_testcapimodule.c @@ -1355,7 +1355,7 @@ static PyObject * test_Z_code(PyObject *self) { PyObject *tuple, *obj; - Py_UNICODE *value1, *value2; + const Py_UNICODE *value1, *value2; Py_ssize_t len1, len2; tuple = PyTuple_New(2); diff --git a/Modules/_tkinter.c b/Modules/_tkinter.c index 6879975..2f83165 100644 --- a/Modules/_tkinter.c +++ b/Modules/_tkinter.c @@ -80,18 +80,6 @@ Copyright (C) 1994 Steen Lumholt. #error "Tk older than 8.3.1 not supported" #endif -/* Unicode conversion assumes that Tcl_UniChar is two bytes. - We cannot test this directly, so we test UTF-8 size instead, - expecting that TCL_UTF_MAX is changed if Tcl ever supports - either UTF-16 or UCS-4. - Redhat 8 sets TCL_UTF_MAX to 6, and uses wchar_t for - Tcl_Unichar. This is also ok as long as Python uses UCS-4, - as well. -*/ -#if TCL_UTF_MAX != 3 && !(defined(Py_UNICODE_WIDE) && TCL_UTF_MAX==6) -#error "unsupported Tcl configuration" -#endif - #if !(defined(MS_WINDOWS) || defined(__CYGWIN__)) #define HAVE_CREATEFILEHANDLER #endif @@ -975,38 +963,44 @@ AsObj(PyObject *value) return result; } else if (PyUnicode_Check(value)) { - Py_UNICODE *inbuf = PyUnicode_AS_UNICODE(value); - Py_ssize_t size = PyUnicode_GET_SIZE(value); - /* This #ifdef assumes that Tcl uses UCS-2. - See TCL_UTF_MAX test above. */ -#if defined(Py_UNICODE_WIDE) && TCL_UTF_MAX == 3 + void *inbuf; + Py_ssize_t size; + int kind; Tcl_UniChar *outbuf = NULL; Py_ssize_t i; - size_t allocsize = ((size_t)size) * sizeof(Tcl_UniChar); - if (allocsize >= size) - outbuf = (Tcl_UniChar*)ckalloc(allocsize); + size_t allocsize; + + if (PyUnicode_READY(value) == -1) + return NULL; + + inbuf = PyUnicode_DATA(value); + size = PyUnicode_GET_LENGTH(value); + kind = PyUnicode_KIND(value); + allocsize = ((size_t)size) * sizeof(Tcl_UniChar); + outbuf = (Tcl_UniChar*)ckalloc(allocsize); /* Else overflow occurred, and we take the next exit */ if (!outbuf) { PyErr_NoMemory(); return NULL; } for (i = 0; i < size; i++) { - if (inbuf[i] >= 0x10000) { + Py_UCS4 ch = PyUnicode_READ(kind, inbuf, i); + /* We cannot test for sizeof(Tcl_UniChar) directly, + so we test for UTF-8 size instead. */ +#if TCL_UTF_MAX == 3 + if (ch >= 0x10000) { /* Tcl doesn't do UTF-16, yet. */ PyErr_SetString(PyExc_ValueError, "unsupported character"); ckfree(FREECAST outbuf); return NULL; +#endif } - outbuf[i] = inbuf[i]; + outbuf[i] = ch; } result = Tcl_NewUnicodeObj(outbuf, size); ckfree(FREECAST outbuf); return result; -#else - return Tcl_NewUnicodeObj(inbuf, size); -#endif - } else if(PyTclObject_Check(value)) { Tcl_Obj *v = ((PyTclObject*)value)->value; @@ -1088,24 +1082,14 @@ FromObj(PyObject* tkapp, Tcl_Obj *value) } if (value->typePtr == app->StringType) { -#if defined(Py_UNICODE_WIDE) && TCL_UTF_MAX==3 - PyObject *result; - int size; - Tcl_UniChar *input; - Py_UNICODE *output; - - size = Tcl_GetCharLength(value); - result = PyUnicode_FromUnicode(NULL, size); - if (!result) - return NULL; - input = Tcl_GetUnicode(value); - output = PyUnicode_AS_UNICODE(result); - while (size--) - *output++ = *input++; - return result; +#if TCL_UTF_MAX==3 + return PyUnicode_FromKindAndData( + PyUnicode_2BYTE_KIND, Tcl_GetUnicode(value), + Tcl_GetCharLength(value)); #else - return PyUnicode_FromUnicode(Tcl_GetUnicode(value), - Tcl_GetCharLength(value)); + return PyUnicode_FromKindAndData( + PyUnicode_4BYTE_KIND, Tcl_GetUnicode(value), + Tcl_GetCharLength(value)); #endif } diff --git a/Modules/arraymodule.c b/Modules/arraymodule.c index 8806bd5..9d49a97 100644 --- a/Modules/arraymodule.c +++ b/Modules/arraymodule.c @@ -2810,9 +2810,9 @@ PyMODINIT_FUNC PyInit_array(void) { PyObject *m; + char buffer[PY_ARRAY_LENGTH(descriptors)], *p; PyObject *typecodes; Py_ssize_t size = 0; - register Py_UNICODE *p; struct arraydescr *descr; if (PyType_Ready(&Arraytype) < 0) @@ -2831,13 +2831,13 @@ PyInit_array(void) size++; } - typecodes = PyUnicode_FromStringAndSize(NULL, size); - p = PyUnicode_AS_UNICODE(typecodes); + p = buffer; for (descr = descriptors; descr->typecode != '\0'; descr++) { *p++ = (char)descr->typecode; } + typecodes = PyUnicode_DecodeASCII(buffer, p - buffer, NULL); - PyModule_AddObject(m, "typecodes", (PyObject *)typecodes); + PyModule_AddObject(m, "typecodes", typecodes); if (PyErr_Occurred()) { Py_DECREF(m); diff --git a/Modules/md5module.c b/Modules/md5module.c index de43f1c..b6ab5ca 100644 --- a/Modules/md5module.c +++ b/Modules/md5module.c @@ -376,7 +376,7 @@ MD5_hexdigest(MD5object *self, PyObject *unused) unsigned char digest[MD5_DIGESTSIZE]; struct md5_state temp; PyObject *retval; - Py_UNICODE *hex_digest; + Py_UCS1 *hex_digest; int i, j; /* Get the raw (binary) digest value */ @@ -384,14 +384,10 @@ MD5_hexdigest(MD5object *self, PyObject *unused) md5_done(&temp, digest); /* Create a new string */ - retval = PyUnicode_FromStringAndSize(NULL, MD5_DIGESTSIZE * 2); + retval = PyUnicode_New(MD5_DIGESTSIZE * 2, 127); if (!retval) return NULL; - hex_digest = PyUnicode_AS_UNICODE(retval); - if (!hex_digest) { - Py_DECREF(retval); - return NULL; - } + hex_digest = PyUnicode_1BYTE_DATA(retval); /* Make hex version of the digest */ for(i=j=0; i<MD5_DIGESTSIZE; i++) { diff --git a/Modules/operator.c b/Modules/operator.c index 866ec3a..2f47573 100644 --- a/Modules/operator.c +++ b/Modules/operator.c @@ -402,7 +402,8 @@ attrgetter_new(PyTypeObject *type, PyObject *args, PyObject *kwds) for (idx = 0; idx < nattrs; ++idx) { PyObject *item = PyTuple_GET_ITEM(args, idx); Py_ssize_t item_len; - Py_UNICODE *item_buffer; + void *data; + unsigned int kind; int dot_count; if (!PyUnicode_Check(item)) { @@ -411,13 +412,18 @@ attrgetter_new(PyTypeObject *type, PyObject *args, PyObject *kwds) Py_DECREF(attr); return NULL; } - item_len = PyUnicode_GET_SIZE(item); - item_buffer = PyUnicode_AS_UNICODE(item); + if (PyUnicode_READY(item)) { + Py_DECREF(attr); + return NULL; + } + item_len = PyUnicode_GET_LENGTH(item); + kind = PyUnicode_KIND(item); + data = PyUnicode_DATA(item); /* check whethere the string is dotted */ dot_count = 0; for (char_idx = 0; char_idx < item_len; ++char_idx) { - if (item_buffer[char_idx] == (Py_UNICODE)'.') + if (PyUnicode_READ(kind, data, char_idx) == '.') ++dot_count; } @@ -438,12 +444,12 @@ attrgetter_new(PyTypeObject *type, PyObject *args, PyObject *kwds) } for (; dot_count > 0; --dot_count) { - while (item_buffer[unibuff_till] != (Py_UNICODE)'.') { + while (PyUnicode_READ(kind, data, unibuff_till) != '.') { ++unibuff_till; } - attr_chain_item = PyUnicode_FromUnicode( - item_buffer + unibuff_from, - unibuff_till - unibuff_from); + attr_chain_item = PyUnicode_Substring(item, + unibuff_from, + unibuff_till); if (attr_chain_item == NULL) { Py_DECREF(attr_chain); Py_DECREF(attr); @@ -456,9 +462,8 @@ attrgetter_new(PyTypeObject *type, PyObject *args, PyObject *kwds) } /* now add the last dotless name */ - attr_chain_item = PyUnicode_FromUnicode( - item_buffer + unibuff_from, - item_len - unibuff_from); + attr_chain_item = PyUnicode_Substring(item, + unibuff_from, item_len); if (attr_chain_item == NULL) { Py_DECREF(attr_chain); Py_DECREF(attr); diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index d923eeb..bcd58d2 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -1102,17 +1102,22 @@ PyUnknownEncodingHandler(void *encodingHandlerData, PyUnicodeObject *_u_string = NULL; int result = 0; int i; + int kind; + void *data; /* Yes, supports only 8bit encodings */ _u_string = (PyUnicodeObject *) PyUnicode_Decode(template_buffer, 256, name, "replace"); - if (_u_string == NULL) + if (_u_string == NULL || PyUnicode_READY(_u_string) == -1) return result; + kind = PyUnicode_KIND(_u_string); + data = PyUnicode_DATA(_u_string); + for (i = 0; i < 256; i++) { /* Stupid to access directly, but fast */ - Py_UNICODE c = _u_string->str[i]; + Py_UCS4 c = PyUnicode_READ(kind, data, i); if (c == Py_UNICODE_REPLACEMENT_CHARACTER) info->map[i] = -1; else @@ -1229,7 +1234,7 @@ get_pybool(int istrue) static PyObject * xmlparse_getattro(xmlparseobject *self, PyObject *nameobj) { - Py_UNICODE *name; + const Py_UNICODE *name; int handlernum = -1; if (!PyUnicode_Check(nameobj)) diff --git a/Modules/sha1module.c b/Modules/sha1module.c index 1cace54..d25aaea 100644 --- a/Modules/sha1module.c +++ b/Modules/sha1module.c @@ -352,7 +352,7 @@ SHA1_hexdigest(SHA1object *self, PyObject *unused) unsigned char digest[SHA1_DIGESTSIZE]; struct sha1_state temp; PyObject *retval; - Py_UNICODE *hex_digest; + Py_UCS1 *hex_digest; int i, j; /* Get the raw (binary) digest value */ @@ -360,14 +360,10 @@ SHA1_hexdigest(SHA1object *self, PyObject *unused) sha1_done(&temp, digest); /* Create a new string */ - retval = PyUnicode_FromStringAndSize(NULL, SHA1_DIGESTSIZE * 2); + retval = PyUnicode_New(SHA1_DIGESTSIZE * 2, 127); if (!retval) return NULL; - hex_digest = PyUnicode_AS_UNICODE(retval); - if (!hex_digest) { - Py_DECREF(retval); - return NULL; - } + hex_digest = PyUnicode_1BYTE_DATA(retval); /* Make hex version of the digest */ for(i=j=0; i<SHA1_DIGESTSIZE; i++) { diff --git a/Modules/sha256module.c b/Modules/sha256module.c index 8617210..fe2dcfa 100644 --- a/Modules/sha256module.c +++ b/Modules/sha256module.c @@ -445,7 +445,7 @@ SHA256_hexdigest(SHAobject *self, PyObject *unused) unsigned char digest[SHA_DIGESTSIZE]; SHAobject temp; PyObject *retval; - Py_UNICODE *hex_digest; + Py_UCS1 *hex_digest; int i, j; /* Get the raw (binary) digest value */ @@ -453,14 +453,10 @@ SHA256_hexdigest(SHAobject *self, PyObject *unused) sha_final(digest, &temp); /* Create a new string */ - retval = PyUnicode_FromStringAndSize(NULL, self->digestsize * 2); + retval = PyUnicode_New(self->digestsize * 2, 127); if (!retval) return NULL; - hex_digest = PyUnicode_AS_UNICODE(retval); - if (!hex_digest) { - Py_DECREF(retval); - return NULL; - } + hex_digest = PyUnicode_1BYTE_DATA(retval); /* Make hex version of the digest */ for(i=j=0; i<self->digestsize; i++) { diff --git a/Modules/sha512module.c b/Modules/sha512module.c index 0faaf5c..4177000 100644 --- a/Modules/sha512module.c +++ b/Modules/sha512module.c @@ -511,7 +511,7 @@ SHA512_hexdigest(SHAobject *self, PyObject *unused) unsigned char digest[SHA_DIGESTSIZE]; SHAobject temp; PyObject *retval; - Py_UNICODE *hex_digest; + Py_UCS1 *hex_digest; int i, j; /* Get the raw (binary) digest value */ @@ -519,14 +519,10 @@ SHA512_hexdigest(SHAobject *self, PyObject *unused) sha512_final(digest, &temp); /* Create a new string */ - retval = PyUnicode_FromStringAndSize(NULL, self->digestsize * 2); + retval = PyUnicode_New(self->digestsize * 2, 127); if (!retval) return NULL; - hex_digest = PyUnicode_AS_UNICODE(retval); - if (!hex_digest) { - Py_DECREF(retval); - return NULL; - } + hex_digest = PyUnicode_1BYTE_DATA(retval); /* Make hex version of the digest */ for (i=j=0; i<self->digestsize; i++) { diff --git a/Modules/sre.h b/Modules/sre.h index 518c11d..0d91f25 100644 --- a/Modules/sre.h +++ b/Modules/sre.h @@ -30,7 +30,8 @@ typedef struct { PyObject* pattern; /* pattern source (or None) */ int flags; /* flags used when compiling pattern source */ PyObject *weakreflist; /* List of weak references */ - int charsize; /* pattern charsize (or -1) */ + int logical_charsize; /* pattern charsize (or -1) */ + int charsize; /* pattern code */ Py_ssize_t codesize; SRE_CODE code[1]; @@ -71,6 +72,7 @@ typedef struct { PyObject* string; Py_ssize_t pos, endpos; /* character size */ + int logical_charsize; /* kind of thing: 1 - bytes, 2/4 - unicode */ int charsize; /* registers */ Py_ssize_t lastindex; diff --git a/Modules/syslogmodule.c b/Modules/syslogmodule.c index 5b86963..f6dadf4 100644 --- a/Modules/syslogmodule.c +++ b/Modules/syslogmodule.c @@ -70,7 +70,7 @@ syslog_get_argv(void) Py_ssize_t argv_len, scriptlen; PyObject *scriptobj; - Py_UNICODE *atslash, *atstart; + Py_ssize_t slash; PyObject *argv = PySys_GetObject("argv"); if (argv == NULL) { @@ -95,11 +95,13 @@ syslog_get_argv(void) return(NULL); } - atstart = PyUnicode_AS_UNICODE(scriptobj); - atslash = Py_UNICODE_strrchr(atstart, SEP); - if (atslash) { - return(PyUnicode_FromUnicode(atslash + 1, - scriptlen - (atslash - atstart) - 1)); + slash = PyUnicode_FindChar(scriptobj, SEP, + 0, PyUnicode_GET_LENGTH(scriptobj), -1); + if (slash == -2) + return NULL; + if (slash != -1) { + return PyUnicode_Substring(scriptobj, slash, + PyUnicode_GET_LENGTH(scriptobj)); } else { Py_INCREF(scriptobj); return(scriptobj); diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 463be2c..d917f91 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -92,16 +92,13 @@ new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4) static Py_UCS4 getuchar(PyUnicodeObject *obj) { - Py_UNICODE *v = PyUnicode_AS_UNICODE(obj); - - if (PyUnicode_GET_SIZE(obj) == 1) - return *v; -#ifndef Py_UNICODE_WIDE - else if ((PyUnicode_GET_SIZE(obj) == 2) && - (0xD800 <= v[0] && v[0] <= 0xDBFF) && - (0xDC00 <= v[1] && v[1] <= 0xDFFF)) - return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000; -#endif + if (PyUnicode_READY(obj)) + return (Py_UCS4)-1; + if (PyUnicode_GET_LENGTH(obj) == 1) { + if (PyUnicode_READY(obj)) + return (Py_UCS4)-1; + return PyUnicode_READ_CHAR(obj, 0); + } PyErr_SetString(PyExc_TypeError, "need a single Unicode character as parameter"); return (Py_UCS4)-1; @@ -1142,7 +1139,6 @@ static PyObject * unicodedata_lookup(PyObject* self, PyObject* args) { Py_UCS4 code; - Py_UNICODE str[2]; char* name; int namelen; @@ -1155,15 +1151,7 @@ unicodedata_lookup(PyObject* self, PyObject* args) return NULL; } -#ifndef Py_UNICODE_WIDE - if (code >= 0x10000) { - str[0] = 0xd800 + ((code - 0x10000) >> 10); - str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff); - return PyUnicode_FromUnicode(str, 2); - } -#endif - str[0] = (Py_UNICODE) code; - return PyUnicode_FromUnicode(str, 1); + return PyUnicode_FromOrdinal(code); } /* XXX Add doc strings. */ diff --git a/Modules/zipimport.c b/Modules/zipimport.c index a83bf8b..6bf8359 100644 --- a/Modules/zipimport.c +++ b/Modules/zipimport.c @@ -64,7 +64,7 @@ static int zipimporter_init(ZipImporter *self, PyObject *args, PyObject *kwds) { PyObject *pathobj, *files; - Py_UNICODE *path, *p, *prefix, buf[MAXPATHLEN+2]; + Py_UCS4 *path, *p, *prefix, buf[MAXPATHLEN+2]; Py_ssize_t len; if (!_PyArg_NoKeywords("zipimporter()", kwds)) @@ -74,8 +74,11 @@ zipimporter_init(ZipImporter *self, PyObject *args, PyObject *kwds) PyUnicode_FSDecoder, &pathobj)) return -1; + if (PyUnicode_READY(pathobj) == -1) + return -1; + /* copy path to buf */ - len = PyUnicode_GET_SIZE(pathobj); + len = PyUnicode_GET_LENGTH(pathobj); if (len == 0) { PyErr_SetString(ZipImportError, "archive path is empty"); goto error; @@ -85,7 +88,8 @@ zipimporter_init(ZipImporter *self, PyObject *args, PyObject *kwds) "archive path too long"); goto error; } - Py_UNICODE_strcpy(buf, PyUnicode_AS_UNICODE(pathobj)); + if (!PyUnicode_AsUCS4(pathobj, buf, PY_ARRAY_LENGTH(buf), 1)) + goto error; #ifdef ALTSEP for (p = buf; *p; p++) { @@ -101,7 +105,8 @@ zipimporter_init(ZipImporter *self, PyObject *args, PyObject *kwds) int rv; if (pathobj == NULL) { - pathobj = PyUnicode_FromUnicode(buf, len); + pathobj = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, + buf, len); if (pathobj == NULL) goto error; } @@ -116,7 +121,7 @@ zipimporter_init(ZipImporter *self, PyObject *args, PyObject *kwds) else if (PyErr_Occurred()) goto error; /* back up one path element */ - p = Py_UNICODE_strrchr(buf, SEP); + p = Py_UCS4_strrchr(buf, SEP); if (prefix != NULL) *prefix = SEP; if (p == NULL) @@ -148,7 +153,7 @@ zipimporter_init(ZipImporter *self, PyObject *args, PyObject *kwds) if (prefix != NULL) { prefix++; - len = Py_UNICODE_strlen(prefix); + len = Py_UCS4_strlen(prefix); if (prefix[len-1] != SEP) { /* add trailing SEP */ prefix[len] = SEP; @@ -158,7 +163,8 @@ zipimporter_init(ZipImporter *self, PyObject *args, PyObject *kwds) } else len = 0; - self->prefix = PyUnicode_FromUnicode(prefix, len); + self->prefix = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, + prefix, len); if (self->prefix == NULL) goto error; @@ -193,7 +199,7 @@ zipimporter_repr(ZipImporter *self) { if (self->archive == NULL) return PyUnicode_FromString("<zipimporter object \"???\">"); - else if (self->prefix != NULL && PyUnicode_GET_SIZE(self->prefix) != 0) + else if (self->prefix != NULL && PyUnicode_GET_LENGTH(self->prefix) != 0) return PyUnicode_FromFormat("<zipimporter object \"%U%c%U\">", self->archive, SEP, self->prefix); else @@ -206,16 +212,24 @@ static PyObject * get_subname(PyObject *fullname) { Py_ssize_t len; - Py_UNICODE *subname; - subname = Py_UNICODE_strrchr(PyUnicode_AS_UNICODE(fullname), '.'); + Py_UCS4 *subname, *fullname_ucs4; + fullname_ucs4 = PyUnicode_AsUCS4Copy(fullname); + if (!fullname_ucs4) + return NULL; + subname = Py_UCS4_strrchr(fullname_ucs4, '.'); if (subname == NULL) { + PyMem_Free(fullname_ucs4); Py_INCREF(fullname); return fullname; } else { + PyObject *result; subname++; - len = PyUnicode_GET_SIZE(fullname); - len -= subname - PyUnicode_AS_UNICODE(fullname); - return PyUnicode_FromUnicode(subname, len); + len = PyUnicode_GET_LENGTH(fullname); + len -= subname - fullname_ucs4; + result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, + subname, len); + PyMem_Free(fullname_ucs4); + return result; } } @@ -228,23 +242,29 @@ static PyObject* make_filename(PyObject *prefix, PyObject *name) { PyObject *pathobj; - Py_UNICODE *p; + Py_UCS4 *p, *buf; + Py_ssize_t len; - pathobj = PyUnicode_FromUnicode(NULL, - PyUnicode_GET_SIZE(prefix) - + PyUnicode_GET_SIZE(name)); - if (pathobj == NULL) + len = PyUnicode_GET_LENGTH(prefix) + PyUnicode_GET_LENGTH(name) + 1; + p = buf = PyMem_Malloc(sizeof(Py_UCS4) * len); + if (buf == NULL) { + PyErr_NoMemory(); return NULL; + } - p = PyUnicode_AS_UNICODE(pathobj); - - Py_UNICODE_strcpy(p, PyUnicode_AS_UNICODE(prefix)); - p += PyUnicode_GET_SIZE(prefix); - Py_UNICODE_strcpy(p, PyUnicode_AS_UNICODE(name)); + if (!PyUnicode_AsUCS4(prefix, p, len, 0)) + return NULL; + p += PyUnicode_GET_LENGTH(prefix); + len -= PyUnicode_GET_LENGTH(prefix); + if (!PyUnicode_AsUCS4(name, p, len, 1)) + return NULL; for (; *p; p++) { if (*p == '.') *p = SEP; } + pathobj = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, + buf, p-buf); + PyMem_Free(buf); return pathobj; } @@ -330,6 +350,8 @@ zipimporter_load_module(PyObject *obj, PyObject *args) if (!PyArg_ParseTuple(args, "U:zipimporter.load_module", &fullname)) return NULL; + if (PyUnicode_READY(fullname) == -1) + return NULL; code = get_module_code(self, fullname, &ispackage, &modpath); if (code == NULL) @@ -426,46 +448,53 @@ zipimporter_is_package(PyObject *obj, PyObject *args) return PyBool_FromLong(mi == MI_PACKAGE); } + static PyObject * zipimporter_get_data(PyObject *obj, PyObject *args) { ZipImporter *self = (ZipImporter *)obj; PyObject *pathobj, *key; - const Py_UNICODE *path; + const Py_UCS4 *path; #ifdef ALTSEP - Py_UNICODE *p, buf[MAXPATHLEN + 1]; + Py_UCS4 *p; #endif - Py_UNICODE *archive; PyObject *toc_entry; Py_ssize_t path_len, len; + Py_UCS4 buf[MAXPATHLEN + 1], archive[MAXPATHLEN + 1]; if (!PyArg_ParseTuple(args, "U:zipimporter.get_data", &pathobj)) return NULL; - path_len = PyUnicode_GET_SIZE(pathobj); - path = PyUnicode_AS_UNICODE(pathobj); -#ifdef ALTSEP + if (PyUnicode_READY(pathobj) == -1) + return NULL; + + path_len = PyUnicode_GET_LENGTH(pathobj); if (path_len >= MAXPATHLEN) { PyErr_SetString(ZipImportError, "path too long"); return NULL; } - Py_UNICODE_strcpy(buf, path); + if (!PyUnicode_AsUCS4(pathobj, buf, PY_ARRAY_LENGTH(buf), 1)) + return NULL; + path = buf; +#ifdef ALTSEP for (p = buf; *p; p++) { if (*p == ALTSEP) *p = SEP; } - path = buf; #endif - archive = PyUnicode_AS_UNICODE(self->archive); - len = PyUnicode_GET_SIZE(self->archive); - if ((size_t)len < Py_UNICODE_strlen(path) && - Py_UNICODE_strncmp(path, archive, len) == 0 && - path[len] == SEP) { - path += len + 1; - path_len -= len + 1; + len = PyUnicode_GET_LENGTH(self->archive); + if ((size_t)len < Py_UCS4_strlen(path)) { + if (!PyUnicode_AsUCS4(self->archive, archive, PY_ARRAY_LENGTH(archive), 1)) + return NULL; + if (Py_UCS4_strncmp(path, archive, len) == 0 && + path[len] == SEP) { + path += len + 1; + path_len -= len + 1; + } } - key = PyUnicode_FromUnicode(path, path_len); + key = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, + path, path_len); if (key == NULL) return NULL; toc_entry = PyDict_GetItem(self->files, key); @@ -725,9 +754,10 @@ read_directory(PyObject *archive) unsigned short flags; long compress, crc, data_size, file_size, file_offset, date, time; long header_offset, name_size, header_size, header_position; - long i, l, count; + long l, count; + Py_ssize_t i; size_t length; - Py_UNICODE path[MAXPATHLEN + 5]; + Py_UCS4 path[MAXPATHLEN + 5]; char name[MAXPATHLEN + 5]; PyObject *nameobj = NULL; char *p, endof_central_dir[22]; @@ -736,12 +766,13 @@ read_directory(PyObject *archive) const char *charset; int bootstrap; - if (PyUnicode_GET_SIZE(archive) > MAXPATHLEN) { + if (PyUnicode_GET_LENGTH(archive) > MAXPATHLEN) { PyErr_SetString(PyExc_OverflowError, "Zip path name is too long"); return NULL; } - Py_UNICODE_strcpy(path, PyUnicode_AS_UNICODE(archive)); + if (!PyUnicode_AsUCS4(archive, path, PY_ARRAY_LENGTH(path), 1)) + return NULL; fp = _Py_fopen(archive, "rb"); if (fp == NULL) { @@ -771,7 +802,7 @@ read_directory(PyObject *archive) if (files == NULL) goto error; - length = Py_UNICODE_strlen(path); + length = Py_UCS4_strlen(path); path[length] = SEP; /* Start of Central Directory */ @@ -802,7 +833,7 @@ read_directory(PyObject *archive) name_size = MAXPATHLEN; p = name; - for (i = 0; i < name_size; i++) { + for (i = 0; i < (Py_ssize_t)name_size; i++) { *p = (char)getc(fp); if (*p == '/') *p = SEP; @@ -827,6 +858,8 @@ read_directory(PyObject *archive) else charset = "cp437"; nameobj = PyUnicode_Decode(name, name_size, charset, NULL); + if (PyUnicode_READY(nameobj) == -1) + goto error; if (nameobj == NULL) { if (bootstrap) PyErr_Format(PyExc_NotImplementedError, @@ -835,11 +868,12 @@ read_directory(PyObject *archive) PY_MAJOR_VERSION, PY_MINOR_VERSION); goto error; } - Py_UNICODE_strncpy(path + length + 1, - PyUnicode_AS_UNICODE(nameobj), - MAXPATHLEN - length - 1); - - pathobj = PyUnicode_FromUnicode(path, Py_UNICODE_strlen(path)); + for (i = 0; (i < MAXPATHLEN - length - 1) && + (i < PyUnicode_GET_LENGTH(nameobj)); i++) + path[length + 1 + i] = PyUnicode_READ_CHAR(nameobj, i); + path[length + 1 + i] = 0; + pathobj = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, + path, Py_UCS4_strlen(path)); if (pathobj == NULL) goto error; t = Py_BuildValue("Niiiiiii", pathobj, compress, data_size, @@ -1148,8 +1182,11 @@ get_mtime_of_source(ZipImporter *self, PyObject *path) time_t mtime; /* strip 'c' or 'o' from *.py[co] */ - stripped = PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(path), - PyUnicode_GET_SIZE(path) - 1); + if (PyUnicode_READY(path) == -1) + return (time_t)-1; + stripped = PyUnicode_FromKindAndData(PyUnicode_KIND(path), + PyUnicode_DATA(path), + PyUnicode_GET_LENGTH(path) - 1); if (stripped == NULL) return (time_t)-1; |