summaryrefslogtreecommitdiffstats
path: root/Objects
diff options
context:
space:
mode:
Diffstat (limited to 'Objects')
-rw-r--r--Objects/unicodeobject.c211
1 files changed, 127 insertions, 84 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 0a33ece..0ecbd1d 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -248,7 +248,7 @@ _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
PyObject **errorHandler,const char *encoding, const char *reason,
- const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
+ PyObject *unicode, PyObject **exceptionObject,
Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
static void
@@ -4745,8 +4745,7 @@ _PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
#endif
rep = unicode_encode_call_errorhandler(
errors, &errorHandler, "utf-8", "surrogates not allowed",
- PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
- &exc, startpos, startpos+1, &newpos);
+ obj, &exc, startpos, startpos+1, &newpos);
if (!rep)
goto error;
@@ -6450,7 +6449,7 @@ make_encode_exception_obj(PyObject **exceptionObject,
{
if (*exceptionObject == NULL) {
*exceptionObject = PyObject_CallFunction(
- PyExc_UnicodeEncodeError, "sUnns",
+ PyExc_UnicodeEncodeError, "sOnns",
encoding, unicode, startpos, endpos, reason);
}
else {
@@ -6502,12 +6501,12 @@ static PyObject *
unicode_encode_call_errorhandler(const char *errors,
PyObject **errorHandler,
const char *encoding, const char *reason,
- const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
+ PyObject *unicode, PyObject **exceptionObject,
Py_ssize_t startpos, Py_ssize_t endpos,
Py_ssize_t *newpos)
{
static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
-
+ Py_ssize_t len;
PyObject *restuple;
PyObject *resunicode;
@@ -6517,8 +6516,12 @@ unicode_encode_call_errorhandler(const char *errors,
return NULL;
}
- make_encode_exception(exceptionObject,
- encoding, unicode, size, startpos, endpos, reason);
+ if (PyUnicode_READY(unicode) < 0)
+ return NULL;
+ len = PyUnicode_GET_LENGTH(unicode);
+
+ make_encode_exception_obj(exceptionObject,
+ encoding, unicode, startpos, endpos, reason);
if (*exceptionObject == NULL)
return NULL;
@@ -6542,8 +6545,8 @@ unicode_encode_call_errorhandler(const char *errors,
return NULL;
}
if (*newpos<0)
- *newpos = size+*newpos;
- if (*newpos<0 || *newpos>size) {
+ *newpos = len + *newpos;
+ if (*newpos<0 || *newpos>len) {
PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Py_DECREF(restuple);
return NULL;
@@ -6554,18 +6557,16 @@ unicode_encode_call_errorhandler(const char *errors,
}
static PyObject *
-unicode_encode_ucs1(const Py_UNICODE *p,
- Py_ssize_t size,
+unicode_encode_ucs1(PyObject *unicode,
const char *errors,
int limit)
{
+ /* input state */
+ Py_ssize_t pos=0, size;
+ int kind;
+ void *data;
/* output object */
PyObject *res;
- /* pointers to the beginning and end+1 of input */
- const Py_UNICODE *startp = p;
- const Py_UNICODE *endp = p + size;
- /* pointer to the beginning of the unencodable characters */
- /* const Py_UNICODE *badp = NULL; */
/* pointer into the output */
char *str;
/* current output position */
@@ -6578,6 +6579,11 @@ unicode_encode_ucs1(const Py_UNICODE *p,
* -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
int known_errorHandler = -1;
+ if (PyUnicode_READY(unicode) < 0)
+ return NULL;
+ size = PyUnicode_GET_LENGTH(unicode);
+ kind = PyUnicode_KIND(unicode);
+ data = PyUnicode_DATA(unicode);
/* allocate enough for a simple encoding without
replacements, if we need more, we'll resize */
if (size == 0)
@@ -6588,28 +6594,24 @@ unicode_encode_ucs1(const Py_UNICODE *p,
str = PyBytes_AS_STRING(res);
ressize = size;
- while (p<endp) {
- Py_UNICODE c = *p;
+ while (pos < size) {
+ Py_UCS4 c = PyUnicode_READ(kind, data, pos);
/* can we encode this? */
if (c<limit) {
/* no overflow check, because we know that the space is enough */
*str++ = (char)c;
- ++p;
+ ++pos;
}
else {
- Py_ssize_t unicodepos = p-startp;
Py_ssize_t requiredsize;
PyObject *repunicode;
- Py_ssize_t repsize;
- Py_ssize_t newpos;
- Py_ssize_t respos;
- Py_UNICODE *uni2;
+ Py_ssize_t repsize, newpos, respos, i;
/* startpos for collecting unencodable chars */
- const Py_UNICODE *collstart = p;
- const Py_UNICODE *collend = p;
+ Py_ssize_t collstart = pos;
+ Py_ssize_t collend = pos;
/* find all unecodable characters */
- while ((collend < endp) && ((*collend)>=limit))
+ while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
++collend;
/* cache callback name lookup (if not done yet, i.e. it's the first error) */
if (known_errorHandler==-1) {
@@ -6626,39 +6628,40 @@ unicode_encode_ucs1(const Py_UNICODE *p,
}
switch (known_errorHandler) {
case 1: /* strict */
- raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
+ raise_encode_exception_obj(&exc, encoding, unicode, collstart, collend, reason);
goto onError;
case 2: /* replace */
while (collstart++<collend)
*str++ = '?'; /* fall through */
case 3: /* ignore */
- p = collend;
+ pos = collend;
break;
case 4: /* xmlcharrefreplace */
respos = str - PyBytes_AS_STRING(res);
- /* determine replacement size (temporarily (mis)uses p) */
- for (p = collstart, repsize = 0; p < collend; ++p) {
- if (*p<10)
+ /* determine replacement size */
+ for (i = collstart, repsize = 0; i < collend; ++i) {
+ Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+ if (ch < 10)
repsize += 2+1+1;
- else if (*p<100)
+ else if (ch < 100)
repsize += 2+2+1;
- else if (*p<1000)
+ else if (ch < 1000)
repsize += 2+3+1;
- else if (*p<10000)
+ else if (ch < 10000)
repsize += 2+4+1;
#ifndef Py_UNICODE_WIDE
else
repsize += 2+5+1;
#else
- else if (*p<100000)
+ else if (ch < 100000)
repsize += 2+5+1;
- else if (*p<1000000)
+ else if (ch < 1000000)
repsize += 2+6+1;
else
repsize += 2+7+1;
#endif
}
- requiredsize = respos+repsize+(endp-collend);
+ requiredsize = respos+repsize+(size-collend);
if (requiredsize > ressize) {
if (requiredsize<2*ressize)
requiredsize = 2*ressize;
@@ -6667,17 +6670,18 @@ unicode_encode_ucs1(const Py_UNICODE *p,
str = PyBytes_AS_STRING(res) + respos;
ressize = requiredsize;
}
- /* generate replacement (temporarily (mis)uses p) */
- for (p = collstart; p < collend; ++p) {
- str += sprintf(str, "&#%d;", (int)*p);
+ /* generate replacement */
+ for (i = collstart; i < collend; ++i) {
+ str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
}
- p = collend;
+ pos = collend;
break;
default:
repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
- encoding, reason, startp, size, &exc,
- collstart-startp, collend-startp, &newpos);
- if (repunicode == NULL)
+ encoding, reason, unicode, &exc,
+ collstart, collend, &newpos);
+ if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
+ PyUnicode_READY(repunicode) < 0))
goto onError;
if (PyBytes_Check(repunicode)) {
/* Directly copy bytes result to output. */
@@ -6694,7 +6698,7 @@ unicode_encode_ucs1(const Py_UNICODE *p,
}
memcpy(str, PyBytes_AsString(repunicode), repsize);
str += repsize;
- p = startp + newpos;
+ pos = newpos;
Py_DECREF(repunicode);
break;
}
@@ -6702,8 +6706,8 @@ unicode_encode_ucs1(const Py_UNICODE *p,
have+the replacement+the rest of the string, so
we won't have to check space for encodable characters) */
respos = str - PyBytes_AS_STRING(res);
- repsize = PyUnicode_GET_SIZE(repunicode);
- requiredsize = respos+repsize+(endp-collend);
+ repsize = PyUnicode_GET_LENGTH(repunicode);
+ requiredsize = respos+repsize+(size-collend);
if (requiredsize > ressize) {
if (requiredsize<2*ressize)
requiredsize = 2*ressize;
@@ -6716,17 +6720,17 @@ unicode_encode_ucs1(const Py_UNICODE *p,
}
/* check if there is anything unencodable in the replacement
and copy it to the output */
- for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
- c = *uni2;
+ for (i = 0; repsize-->0; ++i, ++str) {
+ c = PyUnicode_READ_CHAR(repunicode, i);
if (c >= limit) {
- raise_encode_exception(&exc, encoding, startp, size,
- unicodepos, unicodepos+1, reason);
+ raise_encode_exception_obj(&exc, encoding, unicode,
+ pos, pos+1, reason);
Py_DECREF(repunicode);
goto onError;
}
*str = (char)c;
}
- p = startp + newpos;
+ pos = newpos;
Py_DECREF(repunicode);
}
}
@@ -6750,12 +6754,19 @@ unicode_encode_ucs1(const Py_UNICODE *p,
return NULL;
}
+/* Deprecated */
PyObject *
PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Py_ssize_t size,
const char *errors)
{
- return unicode_encode_ucs1(p, size, errors, 256);
+ PyObject *result;
+ PyObject *unicode = PyUnicode_FromUnicode(p, size);
+ if (unicode == NULL)
+ return NULL;
+ result = unicode_encode_ucs1(unicode, errors, 256);
+ Py_DECREF(unicode);
+ return result;
}
PyObject *
@@ -6774,9 +6785,7 @@ _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
PyUnicode_GET_LENGTH(unicode));
/* Non-Latin-1 characters present. Defer to above function to
raise the exception. */
- return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
- PyUnicode_GET_SIZE(unicode),
- errors);
+ return unicode_encode_ucs1(unicode, errors, 256);
}
PyObject*
@@ -6888,12 +6897,19 @@ PyUnicode_DecodeASCII(const char *s,
return NULL;
}
+/* Deprecated */
PyObject *
PyUnicode_EncodeASCII(const Py_UNICODE *p,
Py_ssize_t size,
const char *errors)
{
- return unicode_encode_ucs1(p, size, errors, 128);
+ PyObject *result;
+ PyObject *unicode = PyUnicode_FromUnicode(p, size);
+ if (unicode == NULL)
+ return NULL;
+ result = unicode_encode_ucs1(unicode, errors, 128);
+ Py_DECREF(unicode);
+ return result;
}
PyObject *
@@ -6910,9 +6926,7 @@ _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
PyUnicode_GET_LENGTH(unicode));
- return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
- PyUnicode_GET_SIZE(unicode),
- errors);
+ return unicode_encode_ucs1(unicode, errors, 128);
}
PyObject *
@@ -8182,13 +8196,13 @@ charmapencode_output(Py_UNICODE c, PyObject *mapping,
Return 0 on success, -1 on error */
static int
charmap_encoding_error(
- const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
+ PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
PyObject **exceptionObject,
int *known_errorHandler, PyObject **errorHandler, const char *errors,
PyObject **res, Py_ssize_t *respos)
{
PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
- Py_ssize_t repsize;
+ Py_ssize_t size, repsize;
Py_ssize_t newpos;
Py_UNICODE *uni2;
/* startpos for collecting unencodable chars */
@@ -8198,19 +8212,25 @@ charmap_encoding_error(
char *encoding = "charmap";
char *reason = "character maps to <undefined>";
charmapencode_result x;
+ Py_UCS4 ch;
+ if (PyUnicode_READY(unicode) < 0)
+ return -1;
+ size = PyUnicode_GET_LENGTH(unicode);
/* find all unencodable characters */
while (collendpos < size) {
PyObject *rep;
if (Py_TYPE(mapping) == &EncodingMapType) {
- int res = encoding_map_lookup(p[collendpos], mapping);
+ ch = PyUnicode_READ_CHAR(unicode, collendpos);
+ int res = encoding_map_lookup(ch, mapping);
if (res != -1)
break;
++collendpos;
continue;
}
- rep = charmapencode_lookup(p[collendpos], mapping);
+ ch = PyUnicode_READ_CHAR(unicode, collendpos);
+ rep = charmapencode_lookup(ch, mapping);
if (rep==NULL)
return -1;
else if (rep!=Py_None) {
@@ -8236,7 +8256,7 @@ charmap_encoding_error(
}
switch (*known_errorHandler) {
case 1: /* strict */
- raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
+ raise_encode_exception_obj(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
return -1;
case 2: /* replace */
for (collpos = collstartpos; collpos<collendpos; ++collpos) {
@@ -8245,7 +8265,7 @@ charmap_encoding_error(
return -1;
}
else if (x==enc_FAILED) {
- raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
+ raise_encode_exception_obj(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
return -1;
}
}
@@ -8258,13 +8278,13 @@ charmap_encoding_error(
for (collpos = collstartpos; collpos < collendpos; ++collpos) {
char buffer[2+29+1+1];
char *cp;
- sprintf(buffer, "&#%d;", (int)p[collpos]);
+ sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
for (cp = buffer; *cp; ++cp) {
x = charmapencode_output(*cp, mapping, res, respos);
if (x==enc_EXCEPTION)
return -1;
else if (x==enc_FAILED) {
- raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
+ raise_encode_exception_obj(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
return -1;
}
}
@@ -8273,7 +8293,7 @@ charmap_encoding_error(
break;
default:
repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
- encoding, reason, p, size, exceptionObject,
+ encoding, reason, unicode, exceptionObject,
collstartpos, collendpos, &newpos);
if (repunicode == NULL)
return -1;
@@ -8305,7 +8325,7 @@ charmap_encoding_error(
}
else if (x==enc_FAILED) {
Py_DECREF(repunicode);
- raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
+ raise_encode_exception_obj(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
return -1;
}
}
@@ -8316,15 +8336,15 @@ charmap_encoding_error(
}
PyObject *
-PyUnicode_EncodeCharmap(const Py_UNICODE *p,
- Py_ssize_t size,
- PyObject *mapping,
- const char *errors)
+_PyUnicode_EncodeCharmap(PyObject *unicode,
+ PyObject *mapping,
+ const char *errors)
{
/* output object */
PyObject *res = NULL;
/* current input position */
Py_ssize_t inpos = 0;
+ Py_ssize_t size;
/* current output position */
Py_ssize_t respos = 0;
PyObject *errorHandler = NULL;
@@ -8334,9 +8354,13 @@ PyUnicode_EncodeCharmap(const Py_UNICODE *p,
* 3=ignore, 4=xmlcharrefreplace */
int known_errorHandler = -1;
+ if (PyUnicode_READY(unicode) < 0)
+ return NULL;
+ size = PyUnicode_GET_LENGTH(unicode);
+
/* Default to Latin-1 */
if (mapping == NULL)
- return PyUnicode_EncodeLatin1(p, size, errors);
+ return unicode_encode_ucs1(unicode, errors, 256);
/* allocate enough for a simple encoding without
replacements, if we need more, we'll resize */
@@ -8347,12 +8371,13 @@ PyUnicode_EncodeCharmap(const Py_UNICODE *p,
return res;
while (inpos<size) {
+ Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
/* try to encode it */
- charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
+ charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
if (x==enc_EXCEPTION) /* error */
goto onError;
if (x==enc_FAILED) { /* unencodable character */
- if (charmap_encoding_error(p, size, &inpos, mapping,
+ if (charmap_encoding_error(unicode, &inpos, mapping,
&exc,
&known_errorHandler, &errorHandler, errors,
&res, &respos)) {
@@ -8380,6 +8405,22 @@ PyUnicode_EncodeCharmap(const Py_UNICODE *p,
return NULL;
}
+/* Deprecated */
+PyObject *
+PyUnicode_EncodeCharmap(const Py_UNICODE *p,
+ Py_ssize_t size,
+ PyObject *mapping,
+ const char *errors)
+{
+ PyObject *result;
+ PyObject *unicode = PyUnicode_FromUnicode(p, size);
+ if (unicode == NULL)
+ return NULL;
+ result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
+ Py_DECREF(unicode);
+ return NULL;
+}
+
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
PyObject *mapping)
@@ -8388,10 +8429,7 @@ PyUnicode_AsCharmapString(PyObject *unicode,
PyErr_BadArgument();
return NULL;
}
- return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
- PyUnicode_GET_SIZE(unicode),
- mapping,
- NULL);
+ return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
}
/* create or adjust a UnicodeTranslateError */
@@ -8893,6 +8931,7 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
Py_UNICODE *p, *end;
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
+ PyObject *unicode;
const char *encoding = "decimal";
const char *reason = "invalid decimal Unicode string";
/* the following variable is used for caching string comparisons
@@ -8973,9 +9012,13 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
p = collend;
break;
default:
+ unicode = PyUnicode_FromUnicode(s, length);
+ if (unicode == NULL)
+ goto onError;
repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
- encoding, reason, s, length, &exc,
+ encoding, reason, unicode, &exc,
collstart-s, collend-s, &newpos);
+ Py_DECREF(unicode);
if (repunicode == NULL)
goto onError;
if (!PyUnicode_Check(repunicode)) {