summary refs log tree commit diff stats
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- Objects/unicodeobject.c 1129
1 files changed, 533 insertions, 596 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 884eaef..1eaf2e9 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -290,6 +290,8 @@ static unsigned char ascii_linebreak[] = {
0, 0, 0, 0, 0, 0, 0, 0
};
+#include "clinic/unicodeobject.c.h"
+
/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
This function is kept for backward compatibility with the old API. */
Py_UNICODE
@@ -519,10 +521,6 @@ unicode_result_unchanged(PyObject *unicode)
return _PyUnicode_Copy(unicode);
}
-#ifdef HAVE_MBCS
-static OSVERSIONINFOEX winver;
-#endif
-
/* --- Bloom Filters ----------------------------------------------------- */
/* stuff to implement simple "bloom filters" for Unicode characters.
@@ -645,7 +643,7 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
-Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
+Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Py_ssize_t size, Py_UCS4 ch,
int direction)
{
@@ -727,7 +725,7 @@ resize_compact(PyObject *unicode, Py_ssize_t length)
_Py_DEC_REFTOTAL;
_Py_ForgetReference(unicode);
- new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
+ new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
if (new_unicode == NULL) {
_Py_NewReference(unicode);
PyErr_NoMemory();
@@ -816,7 +814,7 @@ resize_inplace(PyObject *unicode, Py_ssize_t length)
assert(_PyUnicode_WSTR(unicode) != NULL);
/* check for integer overflow */
- if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
+ if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
PyErr_NoMemory();
return -1;
}
@@ -888,7 +886,7 @@ _PyUnicode_New(Py_ssize_t length)
}
/* Ensure we won't overflow the size. */
- if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
+ if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
return (PyUnicodeObject *)PyErr_NoMemory();
}
if (length < 0) {
@@ -2313,35 +2311,6 @@ PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
#endif /* HAVE_WCHAR_H */
-static void
-makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
- char c)
-{
- *fmt++ = '%';
- if (longflag)
- *fmt++ = 'l';
- else if (longlongflag) {
- /* longlongflag should only ever be nonzero on machines with
- HAVE_LONG_LONG defined */
-#ifdef HAVE_LONG_LONG
- char *f = PY_FORMAT_LONG_LONG;
- while (*f)
- *fmt++ = *f++;
-#else
- /* we shouldn't ever get here */
- assert(0);
- *fmt++ = 'l';
-#endif
- }
- else if (size_tflag) {
- char *f = PY_FORMAT_SIZE_T;
- while (*f)
- *fmt++ = *f++;
- }
- *fmt++ = c;
- *fmt = '\0';
-}
-
/* maximum number of characters required for output of %lld or %p.
We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
plus 1 for the sign. 53/22 is an upper bound for log10(256). */
@@ -2517,48 +2486,42 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
case 'x':
{
/* used by sprintf */
- char fmt[10]; /* should be enough for "%0lld\0" */
char buffer[MAX_LONG_LONG_CHARS];
Py_ssize_t arglen;
if (*f == 'u') {
- makefmt(fmt, longflag, longlongflag, size_tflag, *f);
-
if (longflag)
- len = sprintf(buffer, fmt,
+ len = sprintf(buffer, "%lu",
va_arg(*vargs, unsigned long));
#ifdef HAVE_LONG_LONG
else if (longlongflag)
- len = sprintf(buffer, fmt,
+ len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
va_arg(*vargs, unsigned PY_LONG_LONG));
#endif
else if (size_tflag)
- len = sprintf(buffer, fmt,
+ len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
va_arg(*vargs, size_t));
else
- len = sprintf(buffer, fmt,
+ len = sprintf(buffer, "%u",
va_arg(*vargs, unsigned int));
}
else if (*f == 'x') {
- makefmt(fmt, 0, 0, 0, 'x');
- len = sprintf(buffer, fmt, va_arg(*vargs, int));
+ len = sprintf(buffer, "%x", va_arg(*vargs, int));
}
else {
- makefmt(fmt, longflag, longlongflag, size_tflag, *f);
-
if (longflag)
- len = sprintf(buffer, fmt,
+ len = sprintf(buffer, "%li",
va_arg(*vargs, long));
#ifdef HAVE_LONG_LONG
else if (longlongflag)
- len = sprintf(buffer, fmt,
+ len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
va_arg(*vargs, PY_LONG_LONG));
#endif
else if (size_tflag)
- len = sprintf(buffer, fmt,
+ len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
va_arg(*vargs, Py_ssize_t));
else
- len = sprintf(buffer, fmt,
+ len = sprintf(buffer, "%i",
va_arg(*vargs, int));
}
assert(len >= 0);
@@ -3241,7 +3204,7 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
wlen2 = wcslen(wstr);
if (wlen2 != wlen) {
PyMem_Free(wstr);
- PyErr_SetString(PyExc_TypeError, "embedded null character");
+ PyErr_SetString(PyExc_ValueError, "embedded null character");
return NULL;
}
@@ -3249,7 +3212,7 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
/* "surrogateescape" error handler */
char *str;
- str = _Py_wchar2char(wstr, &error_pos);
+ str = Py_EncodeLocale(wstr, &error_pos);
if (str == NULL) {
if (error_pos == (size_t)-1) {
PyErr_NoMemory();
@@ -3302,7 +3265,7 @@ encode_error:
if (errmsg != NULL) {
size_t errlen;
- wstr = _Py_char2wchar(errmsg, &errlen);
+ wstr = Py_DecodeLocale(errmsg, &errlen);
if (wstr != NULL) {
reason = PyUnicode_FromWideChar(wstr, errlen);
PyMem_RawFree(wstr);
@@ -3477,7 +3440,7 @@ mbstowcs_errorpos(const char *str, size_t len)
memset(&mbs, 0, sizeof mbs);
while (len)
{
- converted = mbrtowc(&ch, (char*)str, len, &mbs);
+ converted = mbrtowc(&ch, str, len, &mbs);
if (converted == 0)
/* Reached end of string */
break;
@@ -3508,19 +3471,20 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
int surrogateescape;
size_t error_pos;
char *errmsg;
- PyObject *reason, *exc;
+ PyObject *reason = NULL; /* initialize to prevent gcc warning */
+ PyObject *exc;
if (locale_error_handler(errors, &surrogateescape) < 0)
return NULL;
- if (str[len] != '\0' || len != strlen(str)) {
- PyErr_SetString(PyExc_TypeError, "embedded null character");
+ if (str[len] != '\0' || (size_t)len != strlen(str)) {
+ PyErr_SetString(PyExc_ValueError, "embedded null byte");
return NULL;
}
if (surrogateescape) {
/* "surrogateescape" error handler */
- wstr = _Py_char2wchar(str, &wlen);
+ wstr = Py_DecodeLocale(str, &wlen);
if (wstr == NULL) {
if (wlen == (size_t)-1)
PyErr_NoMemory();
@@ -3573,7 +3537,7 @@ decode_error:
error_pos = mbstowcs_errorpos(str, len);
if (errmsg != NULL) {
size_t errlen;
- wstr = _Py_char2wchar(errmsg, &errlen);
+ wstr = Py_DecodeLocale(errmsg, &errlen);
if (wstr != NULL) {
reason = PyUnicode_FromWideChar(wstr, errlen);
PyMem_RawFree(wstr);
@@ -3643,21 +3607,6 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
int
-_PyUnicode_HasNULChars(PyObject* str)
-{
- Py_ssize_t pos;
-
- if (PyUnicode_READY(str) == -1)
- return -1;
- pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
- PyUnicode_GET_LENGTH(str), '\0', 1);
- if (pos == -1)
- return 0;
- else
- return 1;
-}
-
-int
PyUnicode_FSConverter(PyObject* arg, void* addr)
{
PyObject *output = NULL;
@@ -3687,8 +3636,8 @@ PyUnicode_FSConverter(PyObject* arg, void* addr)
}
size = PyBytes_GET_SIZE(output);
data = PyBytes_AS_STRING(output);
- if (size != strlen(data)) {
- PyErr_SetString(PyExc_TypeError, "embedded NUL character");
+ if ((size_t)size != strlen(data)) {
+ PyErr_SetString(PyExc_ValueError, "embedded null byte");
Py_DECREF(output);
return 0;
}
@@ -3732,7 +3681,7 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
}
if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
- PyErr_SetString(PyExc_TypeError, "embedded NUL character");
+ PyErr_SetString(PyExc_ValueError, "embedded null character");
Py_DECREF(output);
return 0;
}
@@ -4837,7 +4786,7 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
/* Note: size will always be longer than the resulting Unicode
character count */
- if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
+ if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
return NULL;
unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
if (!unicode)
@@ -5102,32 +5051,22 @@ _PyUnicode_EncodeUTF32(PyObject *str,
const char *errors,
int byteorder)
{
- int kind;
- void *data;
+ enum PyUnicode_Kind kind;
+ const void *data;
Py_ssize_t len;
PyObject *v;
- unsigned char *p;
- Py_ssize_t nsize, i;
- /* Offsets from p for storing byte pairs in the right order. */
+ PY_UINT32_T *out;
#if PY_LITTLE_ENDIAN
- int iorder[] = {0, 1, 2, 3};
+ int native_ordering = byteorder <= 0;
#else
- int iorder[] = {3, 2, 1, 0};
+ int native_ordering = byteorder >= 0;
#endif
const char *encoding;
+ Py_ssize_t nsize, pos;
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
PyObject *rep = NULL;
-#define STORECHAR(CH) \
- do { \
- p[iorder[3]] = ((CH) >> 24) & 0xff; \
- p[iorder[2]] = ((CH) >> 16) & 0xff; \
- p[iorder[1]] = ((CH) >> 8) & 0xff; \
- p[iorder[0]] = (CH) & 0xff; \
- p += 4; \
- } while(0)
-
if (!PyUnicode_Check(str)) {
PyErr_BadArgument();
return NULL;
@@ -5138,59 +5077,53 @@ _PyUnicode_EncodeUTF32(PyObject *str,
data = PyUnicode_DATA(str);
len = PyUnicode_GET_LENGTH(str);
- nsize = len + (byteorder == 0);
- if (nsize > PY_SSIZE_T_MAX / 4)
+ if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
return PyErr_NoMemory();
+ nsize = len + (byteorder == 0);
v = PyBytes_FromStringAndSize(NULL, nsize * 4);
if (v == NULL)
return NULL;
- p = (unsigned char *)PyBytes_AS_STRING(v);
+ /* output buffer is 4-byte aligned */
+ assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
+ out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
if (byteorder == 0)
- STORECHAR(0xFEFF);
+ *out++ = 0xFEFF;
if (len == 0)
- return v;
+ goto done;
- if (byteorder == -1) {
- /* force LE */
- iorder[0] = 0;
- iorder[1] = 1;
- iorder[2] = 2;
- iorder[3] = 3;
+ if (byteorder == -1)
encoding = "utf-32-le";
- }
- else if (byteorder == 1) {
- /* force BE */
- iorder[0] = 3;
- iorder[1] = 2;
- iorder[2] = 1;
- iorder[3] = 0;
+ else if (byteorder == 1)
encoding = "utf-32-be";
- }
else
encoding = "utf-32";
if (kind == PyUnicode_1BYTE_KIND) {
- for (i = 0; i < len; i++)
- STORECHAR(PyUnicode_READ(kind, data, i));
- return v;
+ ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
+ goto done;
}
- for (i = 0; i < len;) {
+ pos = 0;
+ while (pos < len) {
Py_ssize_t repsize, moreunits;
- Py_UCS4 ch = PyUnicode_READ(kind, data, i);
- i++;
- assert(ch <= MAX_UNICODE);
- if (!Py_UNICODE_IS_SURROGATE(ch)) {
- STORECHAR(ch);
- continue;
+
+ if (kind == PyUnicode_2BYTE_KIND) {
+ pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
+ &out, native_ordering);
}
+ else {
+ assert(kind == PyUnicode_4BYTE_KIND);
+ pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
+ &out, native_ordering);
+ }
+ if (pos == len)
+ break;
rep = unicode_encode_call_errorhandler(
errors, &errorHandler,
encoding, "surrogates not allowed",
- str, &exc, i-1, i, &i);
-
+ str, &exc, pos, pos + 1, &pos);
if (!rep)
goto error;
@@ -5198,7 +5131,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
repsize = PyBytes_GET_SIZE(rep);
if (repsize & 3) {
raise_encode_exception(&exc, encoding,
- str, i - 1, i,
+ str, pos - 1, pos,
"surrogates not allowed");
goto error;
}
@@ -5211,7 +5144,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
moreunits = repsize = PyUnicode_GET_LENGTH(rep);
if (!PyUnicode_IS_ASCII(rep)) {
raise_encode_exception(&exc, encoding,
- str, i - 1, i,
+ str, pos - 1, pos,
"surrogates not allowed");
goto error;
}
@@ -5219,7 +5152,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
/* four bytes are reserved for each surrogate */
if (moreunits > 1) {
- Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
+ Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
Py_ssize_t morebytes = 4 * (moreunits - 1);
if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
/* integer overflow */
@@ -5228,20 +5161,16 @@ _PyUnicode_EncodeUTF32(PyObject *str,
}
if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
goto error;
- p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
+ out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
}
if (PyBytes_Check(rep)) {
- Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
- p += repsize;
+ Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
+ out += moreunits;
} else /* rep is unicode */ {
- const Py_UCS1 *repdata;
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
- repdata = PyUnicode_1BYTE_DATA(rep);
- while (repsize--) {
- Py_UCS4 ch = *repdata++;
- STORECHAR(ch);
- }
+ ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
+ &out, native_ordering);
}
Py_CLEAR(rep);
@@ -5250,11 +5179,12 @@ _PyUnicode_EncodeUTF32(PyObject *str,
/* Cut back to size actually needed. This is necessary for, for example,
encoding of a string containing isolated surrogates and the 'ignore'
handler is used. */
- nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
+ nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
if (nsize != PyBytes_GET_SIZE(v))
_PyBytes_Resize(&v, nsize);
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
+ done:
return v;
error:
Py_XDECREF(rep);
@@ -5262,7 +5192,6 @@ _PyUnicode_EncodeUTF32(PyObject *str,
Py_XDECREF(exc);
Py_XDECREF(v);
return NULL;
-#undef STORECHAR
}
PyObject *
@@ -6849,28 +6778,6 @@ code_page_name(UINT code_page, PyObject **obj)
return PyBytes_AS_STRING(*obj);
}
-static int
-is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
-{
- const char *curr = s + offset;
- const char *prev;
-
- if (!IsDBCSLeadByteEx(code_page, *curr))
- return 0;
-
- prev = CharPrevExA(code_page, s, curr, 0);
- if (prev == curr)
- return 1;
- /* FIXME: This code is limited to "true" double-byte encodings,
- as it assumes an incomplete character consists of a single
- byte. */
- if (curr - prev == 2)
- return 1;
- if (!IsDBCSLeadByteEx(code_page, *prev))
- return 1;
- return 0;
-}
-
static DWORD
decode_code_page_flags(UINT code_page)
{
@@ -6945,7 +6852,7 @@ static int
decode_code_page_errors(UINT code_page,
PyObject **v,
const char *in, const int size,
- const char *errors)
+ const char *errors, int final)
{
const char *startin = in;
const char *endin = in + size;
@@ -6972,7 +6879,7 @@ decode_code_page_errors(UINT code_page,
if (encoding == NULL)
return -1;
- if (errors == NULL || strcmp(errors, "strict") == 0) {
+ if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
/* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
UnicodeDecodeError. */
make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
@@ -7035,6 +6942,10 @@ decode_code_page_errors(UINT code_page,
if (outsize <= 0) {
Py_ssize_t startinpos, endinpos, outpos;
+ /* last character in partial decode? */
+ if (in + insize >= endin && !final)
+ break;
+
startinpos = in - startin;
endinpos = startinpos + 1;
outpos = out - PyUnicode_AS_UNICODE(*v);
@@ -7063,7 +6974,8 @@ decode_code_page_errors(UINT code_page,
assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
if (unicode_resize(v, outsize) < 0)
goto error;
- ret = size;
+ /* (in - startin) <= size and size is an int */
+ ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
error:
Py_XDECREF(encoding_obj);
@@ -7104,24 +7016,19 @@ decode_code_page_stateful(int code_page,
done = 1;
}
- /* Skip trailing lead-byte unless 'final' is set */
- if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
- --chunk_size;
-
if (chunk_size == 0 && done) {
if (v != NULL)
break;
_Py_RETURN_UNICODE_EMPTY();
}
-
converted = decode_code_page_strict(code_page, &v,
s, chunk_size);
if (converted == -2)
converted = decode_code_page_errors(code_page, &v,
s, chunk_size,
- errors);
- assert(converted != 0);
+ errors, final);
+ assert(converted != 0 || done);
if (converted < 0) {
Py_XDECREF(v);
@@ -7169,13 +7076,7 @@ static DWORD
encode_code_page_flags(UINT code_page, const char *errors)
{
if (code_page == CP_UTF8) {
- if (winver.dwMajorVersion >= 6)
- /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
- and later */
- return WC_ERR_INVALID_CHARS;
- else
- /* CP_UTF8 only supports flags=0 on Windows older than Vista */
- return 0;
+ return WC_ERR_INVALID_CHARS;
}
else if (code_page == CP_UTF7) {
/* CP_UTF7 only supports flags=0 */
@@ -7485,6 +7386,11 @@ encode_code_page(int code_page,
Py_ssize_t offset;
int chunk_len, ret, done;
+ if (!PyUnicode_Check(unicode)) {
+ PyErr_BadArgument();
+ return NULL;
+ }
+
if (PyUnicode_READY(unicode) == -1)
return NULL;
len = PyUnicode_GET_LENGTH(unicode);
@@ -7558,10 +7464,6 @@ PyUnicode_EncodeCodePage(int code_page,
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
- if (!PyUnicode_Check(unicode)) {
- PyErr_BadArgument();
- return NULL;
- }
return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
}
@@ -8528,10 +8430,10 @@ charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
}
else if (PyLong_Check(x)) {
long value = PyLong_AS_LONG(x);
- long max = PyUnicode_GetMax();
- if (value < 0 || value > max) {
- PyErr_Format(PyExc_TypeError,
- "character mapping must be in range(0x%x)", max+1);
+ if (value < 0 || value > MAX_UNICODE) {
+ PyErr_Format(PyExc_ValueError,
+ "character mapping must be in range(0x%x)",
+ MAX_UNICODE+1);
Py_DECREF(x);
return -1;
}
@@ -8550,76 +8452,168 @@ charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
return -1;
}
}
-/* ensure that *outobj is at least requiredsize characters long,
- if not reallocate and adjust various state variables.
- Return 0 on success, -1 on error */
+
+/* lookup the character, write the result into the writer.
+ Return 1 if the result was written into the writer, return 0 if the mapping
+ was undefined, raise an exception and return -1 on error. */
static int
-charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
- Py_ssize_t requiredsize)
-{
- Py_ssize_t oldsize = *psize;
- Py_UCS4 *new_outobj;
- if (requiredsize > oldsize) {
- /* exponentially overallocate to minimize reallocations */
- if (requiredsize < 2 * oldsize)
- requiredsize = 2 * oldsize;
- new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
- if (new_outobj == 0)
+charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
+ _PyUnicodeWriter *writer)
+{
+ PyObject *item;
+
+ if (charmaptranslate_lookup(ch, mapping, &item))
+ return -1;
+
+ if (item == NULL) {
+ /* not found => default to 1:1 mapping */
+ if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
return -1;
- *outobj = new_outobj;
- *psize = requiredsize;
+ }
+ return 1;
}
- return 0;
+
+ if (item == Py_None) {
+ Py_DECREF(item);
+ return 0;
+ }
+
+ if (PyLong_Check(item)) {
+ long ch = (Py_UCS4)PyLong_AS_LONG(item);
+ /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
+ used it */
+ if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
+ Py_DECREF(item);
+ return -1;
+ }
+ Py_DECREF(item);
+ return 1;
+ }
+
+ if (!PyUnicode_Check(item)) {
+ Py_DECREF(item);
+ return -1;
+ }
+
+ if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
+ Py_DECREF(item);
+ return -1;
+ }
+
+ Py_DECREF(item);
+ return 1;
}
-/* lookup the character, put the result in the output string and adjust
- various state variables. Return a new reference to the object that
- was put in the output buffer in *result, or Py_None, if the mapping was
- undefined (in which case no character was written).
- The called must decref result.
- Return 0 on success, -1 on error. */
+
static int
-charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
- PyObject *mapping, Py_UCS4 **output,
- Py_ssize_t *osize, Py_ssize_t *opos,
- PyObject **res)
+unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
+ Py_UCS1 *translate)
{
- Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
- if (charmaptranslate_lookup(curinp, mapping, res))
+ PyObject *item = NULL;
+ int ret = 0;
+
+ if (charmaptranslate_lookup(ch, mapping, &item)) {
return -1;
- if (*res==NULL) {
+ }
+
+ if (item == Py_None) {
+ /* deletion */
+ translate[ch] = 0xfe;
+ }
+ else if (item == NULL) {
/* not found => default to 1:1 mapping */
- (*output)[(*opos)++] = curinp;
+ translate[ch] = ch;
+ return 1;
}
- else if (*res==Py_None)
- ;
- else if (PyLong_Check(*res)) {
- /* no overflow check, because we know that the space is enough */
- (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
+ else if (PyLong_Check(item)) {
+ long replace = PyLong_AS_LONG(item);
+ /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
+ used it */
+ if (127 < replace) {
+ /* invalid character or character outside ASCII:
+ skip the fast translate */
+ goto exit;
+ }
+ translate[ch] = (Py_UCS1)replace;
}
- else if (PyUnicode_Check(*res)) {
- Py_ssize_t repsize;
- if (PyUnicode_READY(*res) == -1)
+ else if (PyUnicode_Check(item)) {
+ Py_UCS4 replace;
+
+ if (PyUnicode_READY(item) == -1) {
+ Py_DECREF(item);
return -1;
- repsize = PyUnicode_GET_LENGTH(*res);
- if (repsize==1) {
- /* no overflow check, because we know that the space is enough */
- (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
}
- else if (repsize!=0) {
- /* more than one character */
- Py_ssize_t requiredsize = *opos +
- (PyUnicode_GET_LENGTH(input) - ipos) +
- repsize - 1;
- Py_ssize_t i;
- if (charmaptranslate_makespace(output, osize, requiredsize))
+ if (PyUnicode_GET_LENGTH(item) != 1)
+ goto exit;
+
+ replace = PyUnicode_READ_CHAR(item, 0);
+ if (replace > 127)
+ goto exit;
+ translate[ch] = (Py_UCS1)replace;
+ }
+ else {
+ /* not None, NULL, long or unicode */
+ goto exit;
+ }
+ ret = 1;
+
+ exit:
+ Py_DECREF(item);
+ return ret;
+}
+
+/* Fast path for ascii => ascii translation. Return 1 if the whole string
+ was translated into writer, return 0 if the input string was partially
+ translated into writer, raise an exception and return -1 on error. */
+static int
+unicode_fast_translate(PyObject *input, PyObject *mapping,
+ _PyUnicodeWriter *writer, int ignore)
+{
+ Py_UCS1 ascii_table[128], ch, ch2;
+ Py_ssize_t len;
+ Py_UCS1 *in, *end, *out;
+ int res = 0;
+
+ if (PyUnicode_READY(input) == -1)
+ return -1;
+ if (!PyUnicode_IS_ASCII(input))
+ return 0;
+ len = PyUnicode_GET_LENGTH(input);
+
+ memset(ascii_table, 0xff, 128);
+
+ in = PyUnicode_1BYTE_DATA(input);
+ end = in + len;
+
+ assert(PyUnicode_IS_ASCII(writer->buffer));
+ assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
+ out = PyUnicode_1BYTE_DATA(writer->buffer);
+
+ for (; in < end; in++) {
+ ch = *in;
+ ch2 = ascii_table[ch];
+ if (ch2 == 0xff) {
+ int translate = unicode_fast_translate_lookup(mapping, ch,
+ ascii_table);
+ if (translate < 0)
return -1;
- for(i = 0; i < repsize; i++)
- (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
+ if (translate == 0)
+ goto exit;
+ ch2 = ascii_table[ch];
}
+ if (ch2 == 0xfe) {
+ if (ignore)
+ continue;
+ goto exit;
+ }
+ assert(ch2 < 128);
+ *out = ch2;
+ out++;
}
- else
- return -1;
- return 0;
+ res = 1;
+
+exit:
+ writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
+ return res;
}
PyObject *
@@ -8628,22 +8622,17 @@ _PyUnicode_TranslateCharmap(PyObject *input,
const char *errors)
{
/* input object */
- char *idata;
+ char *data;
Py_ssize_t size, i;
int kind;
/* output buffer */
- Py_UCS4 *output = NULL;
- Py_ssize_t osize;
- PyObject *res;
- /* current output position */
- Py_ssize_t opos;
+ _PyUnicodeWriter writer;
+ /* error handler */
char *reason = "character maps to <undefined>";
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
- /* the following variable is used for caching string comparisons
- * -1=not initialized, 0=unknown, 1=strict, 2=replace,
- * 3=ignore, 4=xmlcharrefreplace */
- int known_errorHandler = -1;
+ int ignore;
+ int res;
if (mapping == NULL) {
PyErr_BadArgument();
@@ -8652,10 +8641,9 @@ _PyUnicode_TranslateCharmap(PyObject *input,
if (PyUnicode_READY(input) == -1)
return NULL;
- idata = (char*)PyUnicode_DATA(input);
+ data = (char*)PyUnicode_DATA(input);
kind = PyUnicode_KIND(input);
size = PyUnicode_GET_LENGTH(input);
- i = 0;
if (size == 0) {
Py_INCREF(input);
@@ -8664,121 +8652,81 @@ _PyUnicode_TranslateCharmap(PyObject *input,
/* allocate enough for a simple 1:1 translation without
replacements, if we need more, we'll resize */
- osize = size;
- output = PyMem_NEW(Py_UCS4, osize);
- opos = 0;
- if (output == NULL) {
- PyErr_NoMemory();
+ _PyUnicodeWriter_Init(&writer);
+ if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
goto onError;
+
+ ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
+
+ res = unicode_fast_translate(input, mapping, &writer, ignore);
+ if (res < 0) {
+ _PyUnicodeWriter_Dealloc(&writer);
+ return NULL;
}
+ if (res == 1)
+ return _PyUnicodeWriter_Finish(&writer);
+ i = writer.pos;
while (i<size) {
/* try to encode it */
- PyObject *x = NULL;
- if (charmaptranslate_output(input, i, mapping,
- &output, &osize, &opos, &x)) {
- Py_XDECREF(x);
+ int translate;
+ PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
+ Py_ssize_t newpos;
+ /* startpos for collecting untranslatable chars */
+ Py_ssize_t collstart;
+ Py_ssize_t collend;
+ Py_UCS4 ch;
+
+ ch = PyUnicode_READ(kind, data, i);
+ translate = charmaptranslate_output(ch, mapping, &writer);
+ if (translate < 0)
goto onError;
- }
- Py_XDECREF(x);
- if (x!=Py_None) /* it worked => adjust input pointer */
+
+ if (translate != 0) {
+ /* it worked => adjust input pointer */
++i;
- else { /* untranslatable character */
- PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
- Py_ssize_t repsize;
- Py_ssize_t newpos;
- Py_ssize_t uni2;
- /* startpos for collecting untranslatable chars */
- Py_ssize_t collstart = i;
- Py_ssize_t collend = i+1;
- Py_ssize_t coll;
-
- /* find all untranslatable characters */
- while (collend < size) {
- if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
- goto onError;
- Py_XDECREF(x);
- if (x!=Py_None)
- break;
- ++collend;
- }
- /* cache callback name lookup
- * (if not done yet, i.e. it's the first error) */
- if (known_errorHandler==-1) {
- if ((errors==NULL) || (!strcmp(errors, "strict")))
- known_errorHandler = 1;
- else if (!strcmp(errors, "replace"))
- known_errorHandler = 2;
- else if (!strcmp(errors, "ignore"))
- known_errorHandler = 3;
- else if (!strcmp(errors, "xmlcharrefreplace"))
- known_errorHandler = 4;
- else
- known_errorHandler = 0;
- }
- switch (known_errorHandler) {
- case 1: /* strict */
- make_translate_exception(&exc,
- input, collstart, collend, reason);
- if (exc != NULL)
- PyCodec_StrictErrors(exc);
+ continue;
+ }
+
+ /* untranslatable character */
+ collstart = i;
+ collend = i+1;
+
+ /* find all untranslatable characters */
+ while (collend < size) {
+ PyObject *x;
+ ch = PyUnicode_READ(kind, data, collend);
+ if (charmaptranslate_lookup(ch, mapping, &x))
goto onError;
- case 2: /* replace */
- /* No need to check for space, this is a 1:1 replacement */
- for (coll = collstart; coll<collend; coll++)
- output[opos++] = '?';
- /* fall through */
- case 3: /* ignore */
- i = collend;
- break;
- case 4: /* xmlcharrefreplace */
- /* generate replacement (temporarily (mis)uses i) */
- for (i = collstart; i < collend; ++i) {
- char buffer[2+29+1+1];
- char *cp;
- sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
- if (charmaptranslate_makespace(&output, &osize,
- opos+strlen(buffer)+(size-collend)))
- goto onError;
- for (cp = buffer; *cp; ++cp)
- output[opos++] = *cp;
- }
- i = collend;
+ Py_XDECREF(x);
+ if (x != Py_None)
break;
- default:
- repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
- reason, input, &exc,
- collstart, collend, &newpos);
- if (repunicode == NULL)
- goto onError;
- if (PyUnicode_READY(repunicode) == -1) {
- Py_DECREF(repunicode);
- goto onError;
- }
- /* generate replacement */
- repsize = PyUnicode_GET_LENGTH(repunicode);
- if (charmaptranslate_makespace(&output, &osize,
- opos+repsize+(size-collend))) {
- Py_DECREF(repunicode);
- goto onError;
- }
- for (uni2 = 0; repsize-->0; ++uni2)
- output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
- i = newpos;
+ ++collend;
+ }
+
+ if (ignore) {
+ i = collend;
+ }
+ else {
+ repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
+ reason, input, &exc,
+ collstart, collend, &newpos);
+ if (repunicode == NULL)
+ goto onError;
+ if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Py_DECREF(repunicode);
+ goto onError;
}
+ Py_DECREF(repunicode);
+ i = newpos;
}
}
- res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
- if (!res)
- goto onError;
- PyMem_Free(output);
Py_XDECREF(exc);
Py_XDECREF(errorHandler);
- return res;
+ return _PyUnicodeWriter_Finish(&writer);
onError:
- PyMem_Free(output);
+ _PyUnicodeWriter_Dealloc(&writer);
Py_XDECREF(exc);
Py_XDECREF(errorHandler);
return NULL;
@@ -8880,7 +8828,7 @@ PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
maxchar = 127;
for (i = 0; i < length; i++) {
- Py_UNICODE ch = s[i];
+ Py_UCS4 ch = s[i];
if (ch > 127) {
int decimal = Py_UNICODE_TODECIMAL(ch);
if (decimal >= 0)
@@ -8897,7 +8845,7 @@ PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
data = PyUnicode_DATA(decimal);
/* Iterate over code points */
for (i = 0; i < length; i++) {
- Py_UNICODE ch = s[i];
+ Py_UCS4 ch = s[i];
if (ch > 127) {
int decimal = Py_UNICODE_TODECIMAL(ch);
if (decimal >= 0)
@@ -8978,35 +8926,61 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
/* --- Helpers ------------------------------------------------------------ */
+/* helper macro to fixup start/end slice values */
+#define ADJUST_INDICES(start, end, len) \
+ if (end > len) \
+ end = len; \
+ else if (end < 0) { \
+ end += len; \
+ if (end < 0) \
+ end = 0; \
+ } \
+ if (start < 0) { \
+ start += len; \
+ if (start < 0) \
+ start = 0; \
+ }
+
static Py_ssize_t
any_find_slice(int direction, PyObject* s1, PyObject* s2,
Py_ssize_t start,
Py_ssize_t end)
{
- int kind1, kind2, kind;
+ int kind1, kind2;
void *buf1, *buf2;
Py_ssize_t len1, len2, result;
kind1 = PyUnicode_KIND(s1);
kind2 = PyUnicode_KIND(s2);
- kind = kind1 > kind2 ? kind1 : kind2;
+ if (kind1 < kind2)
+ return -1;
+
+ len1 = PyUnicode_GET_LENGTH(s1);
+ len2 = PyUnicode_GET_LENGTH(s2);
+ ADJUST_INDICES(start, end, len1);
+ if (end - start < len2)
+ return -1;
+
buf1 = PyUnicode_DATA(s1);
buf2 = PyUnicode_DATA(s2);
- if (kind1 != kind)
- buf1 = _PyUnicode_AsKind(s1, kind);
- if (!buf1)
- return -2;
- if (kind2 != kind)
- buf2 = _PyUnicode_AsKind(s2, kind);
- if (!buf2) {
- if (kind1 != kind) PyMem_Free(buf1);
- return -2;
+ if (len2 == 1) {
+ Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
+ result = findchar((const char *)buf1 + kind1*start,
+ kind1, end - start, ch, direction);
+ if (result == -1)
+ return -1;
+ else
+ return start + result;
+ }
+
+ if (kind2 != kind1) {
+ buf2 = _PyUnicode_AsKind(s2, kind1);
+ if (!buf2)
+ return -2;
}
- len1 = PyUnicode_GET_LENGTH(s1);
- len2 = PyUnicode_GET_LENGTH(s2);
if (direction > 0) {
- switch (kind) {
+ switch (kind1) {
case PyUnicode_1BYTE_KIND:
if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
@@ -9024,7 +8998,7 @@ any_find_slice(int direction, PyObject* s1, PyObject* s2,
}
}
else {
- switch (kind) {
+ switch (kind1) {
case PyUnicode_1BYTE_KIND:
if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
@@ -9042,9 +9016,7 @@ any_find_slice(int direction, PyObject* s1, PyObject* s2,
}
}
- if (kind1 != kind)
- PyMem_Free(buf1);
- if (kind2 != kind)
+ if (kind2 != kind1)
PyMem_Free(buf2);
return result;
@@ -9134,21 +9106,6 @@ _PyUnicode_InsertThousandsGrouping(
}
-/* helper macro to fixup start/end slice values */
-#define ADJUST_INDICES(start, end, len) \
- if (end > len) \
- end = len; \
- else if (end < 0) { \
- end += len; \
- if (end < 0) \
- end = 0; \
- } \
- if (start < 0) { \
- start += len; \
- if (start < 0) \
- start = 0; \
- }
-
Py_ssize_t
PyUnicode_Count(PyObject *str,
PyObject *substr,
@@ -9158,7 +9115,7 @@ PyUnicode_Count(PyObject *str,
Py_ssize_t result;
PyObject* str_obj;
PyObject* sub_obj;
- int kind1, kind2, kind;
+ int kind1, kind2;
void *buf1 = NULL, *buf2 = NULL;
Py_ssize_t len1, len2;
@@ -9178,24 +9135,30 @@ PyUnicode_Count(PyObject *str,
kind1 = PyUnicode_KIND(str_obj);
kind2 = PyUnicode_KIND(sub_obj);
- kind = kind1;
- buf1 = PyUnicode_DATA(str_obj);
- buf2 = PyUnicode_DATA(sub_obj);
- if (kind2 != kind) {
- if (kind2 > kind) {
- Py_DECREF(sub_obj);
- Py_DECREF(str_obj);
- return 0;
- }
- buf2 = _PyUnicode_AsKind(sub_obj, kind);
+ if (kind1 < kind2) {
+ Py_DECREF(sub_obj);
+ Py_DECREF(str_obj);
+ return 0;
}
- if (!buf2)
- goto onError;
+
len1 = PyUnicode_GET_LENGTH(str_obj);
len2 = PyUnicode_GET_LENGTH(sub_obj);
-
ADJUST_INDICES(start, end, len1);
- switch (kind) {
+ if (end - start < len2) {
+ Py_DECREF(sub_obj);
+ Py_DECREF(str_obj);
+ return 0;
+ }
+
+ buf1 = PyUnicode_DATA(str_obj);
+ buf2 = PyUnicode_DATA(sub_obj);
+ if (kind2 != kind1) {
+ buf2 = _PyUnicode_AsKind(sub_obj, kind1);
+ if (!buf2)
+ goto onError;
+ }
+
+ switch (kind1) {
case PyUnicode_1BYTE_KIND:
if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
result = asciilib_count(
@@ -9227,14 +9190,14 @@ PyUnicode_Count(PyObject *str,
Py_DECREF(sub_obj);
Py_DECREF(str_obj);
- if (kind2 != kind)
+ if (kind2 != kind1)
PyMem_Free(buf2);
return result;
onError:
Py_DECREF(sub_obj);
Py_DECREF(str_obj);
- if (kind2 != kind && buf2)
+ if (kind2 != kind1 && buf2)
PyMem_Free(buf2);
return -1;
}
@@ -9287,6 +9250,8 @@ PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
}
if (end > PyUnicode_GET_LENGTH(str))
end = PyUnicode_GET_LENGTH(str);
+ if (start >= end)
+ return -1;
kind = PyUnicode_KIND(str);
result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
kind, end-start, ch, direction);
@@ -9315,14 +9280,14 @@ tailmatch(PyObject *self,
PyUnicode_READY(substring) == -1)
return -1;
- if (PyUnicode_GET_LENGTH(substring) == 0)
- return 1;
-
ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
end -= PyUnicode_GET_LENGTH(substring);
if (end < start)
return 0;
+ if (PyUnicode_GET_LENGTH(substring) == 0)
+ return 1;
+
kind_self = PyUnicode_KIND(self);
data_self = PyUnicode_DATA(self);
kind_sub = PyUnicode_KIND(substring);
@@ -9474,7 +9439,7 @@ handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
{
Py_ssize_t j;
int final_sigma;
- Py_UCS4 c = 0;
+ Py_UCS4 c = 0; /* initialize to prevent gcc warning */
/* U+03A3 is in the Final_Sigma context when, it is found like this:
\p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
@@ -10033,7 +9998,7 @@ split(PyObject *self,
PyObject *substring,
Py_ssize_t maxcount)
{
- int kind1, kind2, kind;
+ int kind1, kind2;
void *buf1, *buf2;
Py_ssize_t len1, len2;
PyObject* out;
@@ -10077,23 +10042,25 @@ split(PyObject *self,
kind1 = PyUnicode_KIND(self);
kind2 = PyUnicode_KIND(substring);
- kind = kind1 > kind2 ? kind1 : kind2;
+ len1 = PyUnicode_GET_LENGTH(self);
+ len2 = PyUnicode_GET_LENGTH(substring);
+ if (kind1 < kind2 || len1 < len2) {
+ out = PyList_New(1);
+ if (out == NULL)
+ return NULL;
+ Py_INCREF(self);
+ PyList_SET_ITEM(out, 0, self);
+ return out;
+ }
buf1 = PyUnicode_DATA(self);
buf2 = PyUnicode_DATA(substring);
- if (kind1 != kind)
- buf1 = _PyUnicode_AsKind(self, kind);
- if (!buf1)
- return NULL;
- if (kind2 != kind)
- buf2 = _PyUnicode_AsKind(substring, kind);
- if (!buf2) {
- if (kind1 != kind) PyMem_Free(buf1);
- return NULL;
+ if (kind2 != kind1) {
+ buf2 = _PyUnicode_AsKind(substring, kind1);
+ if (!buf2)
+ return NULL;
}
- len1 = PyUnicode_GET_LENGTH(self);
- len2 = PyUnicode_GET_LENGTH(substring);
- switch (kind) {
+ switch (kind1) {
case PyUnicode_1BYTE_KIND:
if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
out = asciilib_split(
@@ -10113,9 +10080,7 @@ split(PyObject *self,
default:
out = NULL;
}
- if (kind1 != kind)
- PyMem_Free(buf1);
- if (kind2 != kind)
+ if (kind2 != kind1)
PyMem_Free(buf2);
return out;
}
@@ -10125,7 +10090,7 @@ rsplit(PyObject *self,
PyObject *substring,
Py_ssize_t maxcount)
{
- int kind1, kind2, kind;
+ int kind1, kind2;
void *buf1, *buf2;
Py_ssize_t len1, len2;
PyObject* out;
@@ -10169,23 +10134,25 @@ rsplit(PyObject *self,
kind1 = PyUnicode_KIND(self);
kind2 = PyUnicode_KIND(substring);
- kind = kind1 > kind2 ? kind1 : kind2;
+ len1 = PyUnicode_GET_LENGTH(self);
+ len2 = PyUnicode_GET_LENGTH(substring);
+ if (kind1 < kind2 || len1 < len2) {
+ out = PyList_New(1);
+ if (out == NULL)
+ return NULL;
+ Py_INCREF(self);
+ PyList_SET_ITEM(out, 0, self);
+ return out;
+ }
buf1 = PyUnicode_DATA(self);
buf2 = PyUnicode_DATA(substring);
- if (kind1 != kind)
- buf1 = _PyUnicode_AsKind(self, kind);
- if (!buf1)
- return NULL;
- if (kind2 != kind)
- buf2 = _PyUnicode_AsKind(substring, kind);
- if (!buf2) {
- if (kind1 != kind) PyMem_Free(buf1);
- return NULL;
+ if (kind2 != kind1) {
+ buf2 = _PyUnicode_AsKind(substring, kind1);
+ if (!buf2)
+ return NULL;
}
- len1 = PyUnicode_GET_LENGTH(self);
- len2 = PyUnicode_GET_LENGTH(substring);
- switch (kind) {
+ switch (kind1) {
case PyUnicode_1BYTE_KIND:
if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
out = asciilib_rsplit(
@@ -10205,9 +10172,7 @@ rsplit(PyObject *self,
default:
out = NULL;
}
- if (kind1 != kind)
- PyMem_Free(buf1);
- if (kind2 != kind)
+ if (kind2 != kind1)
PyMem_Free(buf2);
return out;
}
@@ -10426,7 +10391,7 @@ replace(PyObject *self, PyObject *str1,
}
/* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
PyUnicode_GET_LENGTH(str1))); */
- if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
+ if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
PyErr_SetString(PyExc_OverflowError,
"replace string is too long");
goto error;
@@ -10835,7 +10800,7 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
}
if (len1 > len2)
return 1; /* uni is longer */
- if (len2 > len1)
+ if (len1 < len2)
return -1; /* str is longer */
return 0;
}
@@ -10843,7 +10808,7 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
void *data = PyUnicode_DATA(uni);
/* Compare Unicode string and source character set string */
for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
- if (chr != str[i])
+ if (chr != (unsigned char)str[i])
return (chr < (unsigned char)(str[i])) ? -1 : 1;
/* This check keeps Python strings that end in '\0' from comparing equal
to C strings identical up to that point. */
@@ -10947,23 +10912,35 @@ PyUnicode_Contains(PyObject *container, PyObject *element)
kind1 = PyUnicode_KIND(str);
kind2 = PyUnicode_KIND(sub);
+ if (kind1 < kind2) {
+ Py_DECREF(sub);
+ Py_DECREF(str);
+ return 0;
+ }
+ len1 = PyUnicode_GET_LENGTH(str);
+ len2 = PyUnicode_GET_LENGTH(sub);
+ if (len1 < len2) {
+ Py_DECREF(sub);
+ Py_DECREF(str);
+ return 0;
+ }
buf1 = PyUnicode_DATA(str);
buf2 = PyUnicode_DATA(sub);
+ if (len2 == 1) {
+ Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
+ result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
+ Py_DECREF(sub);
+ Py_DECREF(str);
+ return result;
+ }
if (kind2 != kind1) {
- if (kind2 > kind1) {
+ buf2 = _PyUnicode_AsKind(sub, kind1);
+ if (!buf2) {
Py_DECREF(sub);
Py_DECREF(str);
- return 0;
+ return -1;
}
- buf2 = _PyUnicode_AsKind(sub, kind1);
}
- if (!buf2) {
- Py_DECREF(sub);
- Py_DECREF(str);
- return -1;
- }
- len1 = PyUnicode_GET_LENGTH(str);
- len2 = PyUnicode_GET_LENGTH(sub);
switch (kind1) {
case PyUnicode_1BYTE_KIND:
@@ -11144,11 +11121,11 @@ interpreted as in slice notation.");
static PyObject *
unicode_count(PyObject *self, PyObject *args)
{
- PyObject *substring = NULL;
+ PyObject *substring = NULL; /* initialize to fix a compiler warning */
Py_ssize_t start = 0;
Py_ssize_t end = PY_SSIZE_T_MAX;
PyObject *result;
- int kind1, kind2, kind;
+ int kind1, kind2;
void *buf1, *buf2;
Py_ssize_t len1, len2, iresult;
@@ -11158,24 +11135,27 @@ unicode_count(PyObject *self, PyObject *args)
kind1 = PyUnicode_KIND(self);
kind2 = PyUnicode_KIND(substring);
- if (kind2 > kind1) {
+ if (kind1 < kind2) {
Py_DECREF(substring);
return PyLong_FromLong(0);
}
- kind = kind1;
- buf1 = PyUnicode_DATA(self);
- buf2 = PyUnicode_DATA(substring);
- if (kind2 != kind)
- buf2 = _PyUnicode_AsKind(substring, kind);
- if (!buf2) {
- Py_DECREF(substring);
- return NULL;
- }
len1 = PyUnicode_GET_LENGTH(self);
len2 = PyUnicode_GET_LENGTH(substring);
-
ADJUST_INDICES(start, end, len1);
- switch (kind) {
+ if (end - start < len2) {
+ Py_DECREF(substring);
+ return PyLong_FromLong(0);
+ }
+ buf1 = PyUnicode_DATA(self);
+ buf2 = PyUnicode_DATA(substring);
+ if (kind2 != kind1) {
+ buf2 = _PyUnicode_AsKind(substring, kind1);
+ if (!buf2) {
+ Py_DECREF(substring);
+ return NULL;
+ }
+ }
+ switch (kind1) {
case PyUnicode_1BYTE_KIND:
iresult = ucs1lib_count(
((Py_UCS1*)buf1) + start, end - start,
@@ -11200,7 +11180,7 @@ unicode_count(PyObject *self, PyObject *args)
result = PyLong_FromSsize_t(iresult);
- if (kind2 != kind)
+ if (kind2 != kind1)
PyMem_Free(buf2);
Py_DECREF(substring);
@@ -11332,6 +11312,7 @@ Return -1 on failure.");
static PyObject *
unicode_find(PyObject *self, PyObject *args)
{
+ /* initialize variables to prevent gcc warning */
PyObject *substring = NULL;
Py_ssize_t start = 0;
Py_ssize_t end = 0;
@@ -11419,6 +11400,7 @@ Like S.find() but raise ValueError when the substring is not found.");
static PyObject *
unicode_index(PyObject *self, PyObject *args)
{
+ /* initialize variables to prevent gcc warning */
Py_ssize_t result;
PyObject *substring = NULL;
Py_ssize_t start = 0;
@@ -12497,6 +12479,7 @@ Return -1 on failure.");
static PyObject *
unicode_rfind(PyObject *self, PyObject *args)
{
+ /* initialize variables to prevent gcc warning */
PyObject *substring = NULL;
Py_ssize_t start = 0;
Py_ssize_t end = 0;
@@ -12533,6 +12516,7 @@ Like S.rfind() but raise ValueError when the substring is not found.");
static PyObject *
unicode_rindex(PyObject *self, PyObject *args)
{
+ /* initialize variables to prevent gcc warning */
PyObject *substring = NULL;
Py_ssize_t start = 0;
Py_ssize_t end = 0;
@@ -12647,8 +12631,8 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
PyObject* str_obj;
PyObject* sep_obj;
PyObject* out;
- int kind1, kind2, kind;
- void *buf1 = NULL, *buf2 = NULL;
+ int kind1, kind2;
+ void *buf1, *buf2;
Py_ssize_t len1, len2;
str_obj = PyUnicode_FromObject(str_in);
@@ -12667,21 +12651,29 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
kind1 = PyUnicode_KIND(str_obj);
kind2 = PyUnicode_KIND(sep_obj);
- kind = Py_MAX(kind1, kind2);
- buf1 = PyUnicode_DATA(str_obj);
- if (kind1 != kind)
- buf1 = _PyUnicode_AsKind(str_obj, kind);
- if (!buf1)
- goto onError;
- buf2 = PyUnicode_DATA(sep_obj);
- if (kind2 != kind)
- buf2 = _PyUnicode_AsKind(sep_obj, kind);
- if (!buf2)
- goto onError;
len1 = PyUnicode_GET_LENGTH(str_obj);
len2 = PyUnicode_GET_LENGTH(sep_obj);
+ if (kind1 < kind2 || len1 < len2) {
+ _Py_INCREF_UNICODE_EMPTY();
+ if (!unicode_empty)
+ out = NULL;
+ else {
+ out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
+ Py_DECREF(unicode_empty);
+ }
+ Py_DECREF(sep_obj);
+ Py_DECREF(str_obj);
+ return out;
+ }
+ buf1 = PyUnicode_DATA(str_obj);
+ buf2 = PyUnicode_DATA(sep_obj);
+ if (kind2 != kind1) {
+ buf2 = _PyUnicode_AsKind(sep_obj, kind1);
+ if (!buf2)
+ goto onError;
+ }
- switch (kind) {
+ switch (kind1) {
case PyUnicode_1BYTE_KIND:
if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
@@ -12701,18 +12693,14 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
Py_DECREF(sep_obj);
Py_DECREF(str_obj);
- if (kind1 != kind)
- PyMem_Free(buf1);
- if (kind2 != kind)
+ if (kind2 != kind1)
PyMem_Free(buf2);
return out;
onError:
Py_DECREF(sep_obj);
Py_DECREF(str_obj);
- if (kind1 != kind && buf1)
- PyMem_Free(buf1);
- if (kind2 != kind && buf2)
+ if (kind2 != kind1 && buf2)
PyMem_Free(buf2);
return NULL;
}
@@ -12724,8 +12712,8 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
PyObject* str_obj;
PyObject* sep_obj;
PyObject* out;
- int kind1, kind2, kind;
- void *buf1 = NULL, *buf2 = NULL;
+ int kind1, kind2;
+ void *buf1, *buf2;
Py_ssize_t len1, len2;
str_obj = PyUnicode_FromObject(str_in);
@@ -12739,21 +12727,29 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
kind1 = PyUnicode_KIND(str_obj);
kind2 = PyUnicode_KIND(sep_obj);
- kind = Py_MAX(kind1, kind2);
- buf1 = PyUnicode_DATA(str_obj);
- if (kind1 != kind)
- buf1 = _PyUnicode_AsKind(str_obj, kind);
- if (!buf1)
- goto onError;
- buf2 = PyUnicode_DATA(sep_obj);
- if (kind2 != kind)
- buf2 = _PyUnicode_AsKind(sep_obj, kind);
- if (!buf2)
- goto onError;
len1 = PyUnicode_GET_LENGTH(str_obj);
len2 = PyUnicode_GET_LENGTH(sep_obj);
+ if (kind1 < kind2 || len1 < len2) {
+ _Py_INCREF_UNICODE_EMPTY();
+ if (!unicode_empty)
+ out = NULL;
+ else {
+ out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
+ Py_DECREF(unicode_empty);
+ }
+ Py_DECREF(sep_obj);
+ Py_DECREF(str_obj);
+ return out;
+ }
+ buf1 = PyUnicode_DATA(str_obj);
+ buf2 = PyUnicode_DATA(sep_obj);
+ if (kind2 != kind1) {
+ buf2 = _PyUnicode_AsKind(sep_obj, kind1);
+ if (!buf2)
+ goto onError;
+ }
- switch (kind) {
+ switch (kind1) {
case PyUnicode_1BYTE_KIND:
if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
@@ -12773,18 +12769,14 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
Py_DECREF(sep_obj);
Py_DECREF(str_obj);
- if (kind1 != kind)
- PyMem_Free(buf1);
- if (kind2 != kind)
+ if (kind2 != kind1)
PyMem_Free(buf2);
return out;
onError:
Py_DECREF(sep_obj);
Py_DECREF(str_obj);
- if (kind1 != kind && buf1)
- PyMem_Free(buf1);
- if (kind2 != kind && buf2)
+ if (kind2 != kind1 && buf2)
PyMem_Free(buf2);
return NULL;
}
@@ -12930,47 +12922,9 @@ character at the same position in y. If there is a third argument, it
must be a string, whose characters will be mapped to None in the result.
[clinic start generated code]*/
-PyDoc_STRVAR(unicode_maketrans__doc__,
-"maketrans(x, y=None, z=None, /)\n"
-"--\n"
-"\n"
-"Return a translation table usable for str.translate().\n"
-"\n"
-"If there is only one argument, it must be a dictionary mapping Unicode\n"
-"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
-"Character keys will be then converted to ordinals.\n"
-"If there are two arguments, they must be strings of equal length, and\n"
-"in the resulting dictionary, each character in x will be mapped to the\n"
-"character at the same position in y. If there is a third argument, it\n"
-"must be a string, whose characters will be mapped to None in the result.");
-
-#define UNICODE_MAKETRANS_METHODDEF \
- {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},
-
-static PyObject *
-unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z);
-
-static PyObject *
-unicode_maketrans(void *null, PyObject *args)
-{
- PyObject *return_value = NULL;
- PyObject *x;
- PyObject *y = NULL;
- PyObject *z = NULL;
-
- if (!PyArg_ParseTuple(args,
- "O|UU:maketrans",
- &x, &y, &z))
- goto exit;
- return_value = unicode_maketrans_impl(x, y, z);
-
-exit:
- return return_value;
-}
-
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
-/*[clinic end generated code: output=566edf630f77436a input=7bfbf529a293c6c5]*/
+/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
{
PyObject *new = NULL, *key, *value;
Py_ssize_t i = 0;
@@ -13397,6 +13351,7 @@ _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
maxchar = PyUnicode_MAX_CHAR_VALUE(str);
if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
if (writer->buffer == NULL && !writer->overallocate) {
+ assert(_PyUnicode_CheckConsistency(str, 1));
writer->readonly = 1;
Py_INCREF(str);
writer->buffer = str;
@@ -13905,8 +13860,8 @@ formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
* CAUTION: o, x and X conversions on regular ints can never
* produce a '-' sign, but can for Python's unbounded ints.
*/
-static PyObject*
-formatlong(PyObject *val, struct unicode_format_arg_t *arg)
+PyObject *
+_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
PyObject *result = NULL;
char *buf;
@@ -13916,8 +13871,6 @@ formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Py_ssize_t llen;
int numdigits; /* len == numnondigits + numdigits */
int numnondigits = 0;
- int prec = arg->prec;
- int type = arg->ch;
/* Avoid exceeding SSIZE_T_MAX */
if (prec > INT_MAX-3) {
@@ -13966,7 +13919,7 @@ formatlong(PyObject *val, struct unicode_format_arg_t *arg)
if (llen > INT_MAX) {
Py_DECREF(result);
PyErr_SetString(PyExc_ValueError,
- "string too large in _PyBytes_FormatLong");
+ "string too large in _PyUnicode_FormatLong");
return NULL;
}
len = (int)llen;
@@ -13976,7 +13929,7 @@ formatlong(PyObject *val, struct unicode_format_arg_t *arg)
assert(numdigits > 0);
/* Get rid of base marker unless F_ALT */
- if (((arg->flags & F_ALT) == 0 &&
+ if (((alt) == 0 &&
(type == 'o' || type == 'x' || type == 'X'))) {
assert(buf[sign] == '0');
assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
@@ -14051,24 +14004,14 @@ mainformatlong(PyObject *v,
if (!PyNumber_Check(v))
goto wrongtype;
- /* make sure number is a type of integer */
- /* if not, issue deprecation warning for now */
+ /* make sure number is a type of integer for o, x, and X */
if (!PyLong_Check(v)) {
if (type == 'o' || type == 'x' || type == 'X') {
iobj = PyNumber_Index(v);
if (iobj == NULL) {
- PyErr_Clear();
- if (PyErr_WarnEx(PyExc_DeprecationWarning,
- "automatic int conversions have been deprecated",
- 1)) {
- return -1;
- }
- iobj = PyNumber_Long(v);
- if (iobj == NULL ) {
- if (PyErr_ExceptionMatches(PyExc_TypeError))
- goto wrongtype;
- return -1;
- }
+ if (PyErr_ExceptionMatches(PyExc_TypeError))
+ goto wrongtype;
+ return -1;
}
}
else {
@@ -14121,7 +14064,7 @@ mainformatlong(PyObject *v,
return 1;
}
- res = formatlong(iobj, arg);
+ res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Py_DECREF(iobj);
if (res == NULL)
return -1;
@@ -14129,10 +14072,23 @@ mainformatlong(PyObject *v,
return 0;
wrongtype:
- PyErr_Format(PyExc_TypeError,
- "%%%c format: a number is required, "
- "not %.200s",
- type, Py_TYPE(v)->tp_name);
+ switch(type)
+ {
+ case 'o':
+ case 'x':
+ case 'X':
+ PyErr_Format(PyExc_TypeError,
+ "%%%c format: an integer is required, "
+ "not %.200s",
+ type, Py_TYPE(v)->tp_name);
+ break;
+ default:
+ PyErr_Format(PyExc_TypeError,
+ "%%%c format: a number is required, "
+ "not %.200s",
+ type, Py_TYPE(v)->tp_name);
+ break;
+ }
return -1;
}
@@ -14150,22 +14106,10 @@ formatchar(PyObject *v)
PyObject *iobj;
long x;
/* make sure number is a type of integer */
- /* if not, issue deprecation warning for now */
if (!PyLong_Check(v)) {
iobj = PyNumber_Index(v);
if (iobj == NULL) {
- PyErr_Clear();
- if (PyErr_WarnEx(PyExc_DeprecationWarning,
- "automatic int conversions have been deprecated",
- 1)) {
- return -1;
- }
- iobj = PyNumber_Long(v);
- if (iobj == NULL ) {
- if (PyErr_ExceptionMatches(PyExc_TypeError))
- goto onError;
- return -1;
- }
+ goto onError;
}
v = iobj;
Py_DECREF(iobj);
@@ -14997,13 +14941,6 @@ int _PyUnicode_Init(void)
if (PyType_Ready(&PyFormatterIter_Type) < 0)
Py_FatalError("Can't initialize formatter iter type");
-#ifdef HAVE_MBCS
- winver.dwOSVersionInfoSize = sizeof(winver);
- if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
- PyErr_SetFromWindowsErr(0);
- return -1;
- }
-#endif
return 0;
}
@@ -15419,7 +15356,7 @@ PyUnicode_AsUnicodeCopy(PyObject *unicode)
if (u == NULL)
return NULL;
/* Ensure we won't overflow the size. */
- if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
+ if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
PyErr_NoMemory();
return NULL;
}