summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- Objects/stringlib/codecs.h 18
-rw-r--r-- Objects/unicodeobject.c 193
2 files changed, 160 insertions, 51 deletions
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
index d7a9918..ae99d1a 100644
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@@ -334,7 +334,6 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
i += (endpos - startpos - 1);
break;
-
case _Py_ERROR_SURROGATEPASS:
for (k=startpos; k<endpos; k++) {
ch = data[k];
@@ -345,6 +344,22 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
i += (endpos - startpos - 1);
break;
+ case _Py_ERROR_BACKSLASHREPLACE:
+ p = backslashreplace(&writer, max_char_size, p,
+ unicode, startpos, endpos);
+ if (p == NULL)
+ goto error;
+ i += (endpos - startpos - 1);
+ break;
+
+ case _Py_ERROR_XMLCHARREFREPLACE:
+ p = xmlcharrefreplace(&writer, max_char_size, p,
+ unicode, startpos, endpos);
+ if (p == NULL)
+ goto error;
+ i += (endpos - startpos - 1);
+ break;
+
case _Py_ERROR_SURROGATEESCAPE:
for (k=startpos; k<endpos; k++) {
ch = data[k];
@@ -359,7 +374,6 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
startpos = k;
assert(startpos < endpos);
/* fall through the default handler */
-
default:
rep = unicode_encode_call_errorhandler(
errors, &error_handler_obj, "utf-8", "surrogates not allowed",
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 010a610..e0b3c68 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -305,9 +305,10 @@ typedef enum {
_Py_ERROR_UNKNOWN=0,
_Py_ERROR_STRICT,
_Py_ERROR_SURROGATEESCAPE,
- _Py_ERROR_SURROGATEPASS,
_Py_ERROR_REPLACE,
_Py_ERROR_IGNORE,
+ _Py_ERROR_BACKSLASHREPLACE,
+ _Py_ERROR_SURROGATEPASS,
_Py_ERROR_XMLCHARREFREPLACE,
_Py_ERROR_OTHER
} _Py_error_handler;
@@ -315,18 +316,18 @@ typedef enum {
static _Py_error_handler
get_error_handler(const char *errors)
{
- if (errors == NULL)
- return _Py_ERROR_STRICT;
- if (strcmp(errors, "strict") == 0)
+ if (errors == NULL || strcmp(errors, "strict") == 0)
return _Py_ERROR_STRICT;
if (strcmp(errors, "surrogateescape") == 0)
return _Py_ERROR_SURROGATEESCAPE;
- if (strcmp(errors, "surrogatepass") == 0)
- return _Py_ERROR_SURROGATEPASS;
- if (strcmp(errors, "ignore") == 0)
- return _Py_ERROR_IGNORE;
if (strcmp(errors, "replace") == 0)
return _Py_ERROR_REPLACE;
+ if (strcmp(errors, "ignore") == 0)
+ return _Py_ERROR_IGNORE;
+ if (strcmp(errors, "backslashreplace") == 0)
+ return _Py_ERROR_BACKSLASHREPLACE;
+ if (strcmp(errors, "surrogatepass") == 0)
+ return _Py_ERROR_SURROGATEPASS;
if (strcmp(errors, "xmlcharrefreplace") == 0)
return _Py_ERROR_XMLCHARREFREPLACE;
return _Py_ERROR_OTHER;
@@ -771,6 +772,126 @@ unicode_result_unchanged(PyObject *unicode)
return _PyUnicode_Copy(unicode);
}
+/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
+   ASCII, Latin1, UTF-8, etc.
+
+   Each character of unicode[collstart:collend] is written at str as an
+   escape sequence with lowercase hexadecimal digits: backslash-x and 2
+   digits below 0x100, backslash-u and 4 digits below 0x10000, backslash-U
+   and 8 digits otherwise.
+
+   writer is the bytes writer passed to _PyBytesWriter_Prepare() and str is
+   the current write position.  prealloc_per_char is taken to be the number
+   of output bytes per character that the caller has already reserved
+   (TODO confirm against the callers); the writer is only asked for the part
+   of the replacement that exceeds prealloc_per_char * (collend - collstart).
+
+   Return the write position just past the generated text.  Return NULL if
+   _PyBytesWriter_Prepare() fails, or with OverflowError set if the total
+   replacement size does not fit in a Py_ssize_t. */
+static char*
+backslashreplace(_PyBytesWriter *writer, Py_ssize_t prealloc_per_char,
+                 char *str,
+                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
+{
+    Py_ssize_t size, i, prealloc;
+    Py_UCS4 ch;
+    enum PyUnicode_Kind kind;
+    void *data;
+
+    assert(PyUnicode_IS_READY(unicode));
+    kind = PyUnicode_KIND(unicode);
+    data = PyUnicode_DATA(unicode);
+
+    size = 0;
+    /* determine replacement size: a 2-byte prefix followed by 2, 4 or 8
+       hexadecimal digits.  These counts must stay in sync with the format
+       strings of the generation loop below. */
+    for (i = collstart; i < collend; ++i) {
+        Py_ssize_t incr;
+
+        ch = PyUnicode_READ(kind, data, i);
+        if (ch < 0x100)
+            incr = 2+2;
+        else if (ch < 0x10000)
+            incr = 2+4;
+        else {
+            assert(ch <= MAX_UNICODE);
+            /* "%08x" always emits 8 digits, so reserve 2+8 bytes; reserving
+               only 2+6 would let the generation loop write past the space
+               requested from the writer. */
+            incr = 2+8;
+        }
+        if (size > PY_SSIZE_T_MAX - incr) {
+            PyErr_SetString(PyExc_OverflowError,
+                            "encoded result is too long for a Python string");
+            return NULL;
+        }
+        size += incr;
+    }
+
+    /* only request the bytes that exceed what is counted as preallocated */
+    prealloc = prealloc_per_char * (collend - collstart);
+    if (size > prealloc) {
+        str = _PyBytesWriter_Prepare(writer, str, size - prealloc);
+        if (str == NULL)
+            return NULL;
+    }
+
+    /* generate replacement.
+       NOTE(review): sprintf() also stores a terminating NUL one byte past
+       the text counted in size -- confirm that the writer's buffer always
+       has room for it. */
+    for (i = collstart; i < collend; ++i) {
+        ch = PyUnicode_READ(kind, data, i);
+        if (ch < 0x100)
+            str += sprintf(str, "\\x%02x", (unsigned int)ch);
+        else if (ch < 0x10000)
+            str += sprintf(str, "\\u%04x", (unsigned int)ch);
+        else {
+            assert(ch <= MAX_UNICODE);
+            str += sprintf(str, "\\U%08x", (unsigned int)ch);
+        }
+    }
+    return str;
+}
+
+/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
+ ASCII, Latin1, UTF-8, etc. */
+static char*
+xmlcharrefreplace(_PyBytesWriter *writer, Py_ssize_t prealloc_per_char,
+ char *str,
+ PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
+{
+ Py_ssize_t size, i, prealloc;
+ Py_UCS4 ch;
+ enum PyUnicode_Kind kind;
+ void *data;
+
+ assert(PyUnicode_IS_READY(unicode));
+ kind = PyUnicode_KIND(unicode);
+ data = PyUnicode_DATA(unicode);
+
+ size = 0;
+ /* determine replacement size */
+ for (i = collstart; i < collend; ++i) {
+ Py_ssize_t incr;
+
+ ch = PyUnicode_READ(kind, data, i);
+ if (ch < 10)
+ incr = 2+1+1;
+ else if (ch < 100)
+ incr = 2+2+1;
+ else if (ch < 1000)
+ incr = 2+3+1;
+ else if (ch < 10000)
+ incr = 2+4+1;
+ else if (ch < 100000)
+ incr = 2+5+1;
+ else if (ch < 1000000)
+ incr = 2+6+1;
+ else {
+ assert(ch <= MAX_UNICODE);
+ incr = 2+7+1;
+ }
+ if (size > PY_SSIZE_T_MAX - incr) {
+ PyErr_SetString(PyExc_OverflowError,
+ "encoded result is too long for a Python string");
+ return NULL;
+ }
+ size += incr;
+ }
+
+ prealloc = prealloc_per_char * (collend - collstart);
+ if (size > prealloc) {
+ str = _PyBytesWriter_Prepare(writer, str, size - prealloc);
+ if (str == NULL)
+ return NULL;
+ }
+
+ /* generate replacement */
+ for (i = collstart; i < collend; ++i) {
+ str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
+ }
+ return str;
+}
+
/* --- Bloom Filters ----------------------------------------------------- */
/* stuff to implement simple "bloom filters" for Unicode characters.
@@ -6713,7 +6834,6 @@ unicode_encode_ucs1(PyObject *unicode,
++pos;
}
else {
- Py_ssize_t requiredsize;
PyObject *repunicode;
Py_ssize_t repsize, newpos, i;
/* startpos for collecting unencodable chars */
@@ -6744,42 +6864,19 @@ unicode_encode_ucs1(PyObject *unicode,
pos = collend;
break;
- case _Py_ERROR_XMLCHARREFREPLACE:
- requiredsize = 0;
- /* determine replacement size */
- for (i = collstart; i < collend; ++i) {
- Py_ssize_t incr;
-
- ch = PyUnicode_READ(kind, data, i);
- if (ch < 10)
- incr = 2+1+1;
- else if (ch < 100)
- incr = 2+2+1;
- else if (ch < 1000)
- incr = 2+3+1;
- else if (ch < 10000)
- incr = 2+4+1;
- else if (ch < 100000)
- incr = 2+5+1;
- else if (ch < 1000000)
- incr = 2+6+1;
- else {
- assert(ch <= MAX_UNICODE);
- incr = 2+7+1;
- }
- if (requiredsize > PY_SSIZE_T_MAX - incr)
- goto overflow;
- requiredsize += incr;
- }
-
- str = _PyBytesWriter_Prepare(&writer, str, requiredsize-1);
+ case _Py_ERROR_BACKSLASHREPLACE:
+ str = backslashreplace(&writer, 1, str,
+ unicode, collstart, collend);
if (str == NULL)
goto onError;
+ pos = collend;
+ break;
- /* generate replacement */
- for (i = collstart; i < collend; ++i) {
- str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
- }
+ case _Py_ERROR_XMLCHARREFREPLACE:
+ str = xmlcharrefreplace(&writer, 1, str,
+ unicode, collstart, collend);
+ if (str == NULL)
+ goto onError;
pos = collend;
break;
@@ -6810,9 +6907,11 @@ unicode_encode_ucs1(PyObject *unicode,
if (PyBytes_Check(repunicode)) {
/* Directly copy bytes result to output. */
repsize = PyBytes_Size(repunicode);
- str = _PyBytesWriter_Prepare(&writer, str, repsize-1);
- if (str == NULL)
- goto onError;
+ if (repsize > 1) {
+ str = _PyBytesWriter_Prepare(&writer, str, repsize-1);
+ if (str == NULL)
+ goto onError;
+ }
memcpy(str, PyBytes_AsString(repunicode), repsize);
str += repsize;
pos = newpos;
@@ -6856,10 +6955,6 @@ unicode_encode_ucs1(PyObject *unicode,
Py_XDECREF(exc);
return _PyBytesWriter_Finish(&writer, str);
- overflow:
- PyErr_SetString(PyExc_OverflowError,
- "encoded result is too long for a Python string");
-
onError:
_PyBytesWriter_Dealloc(&writer);
Py_XDECREF(error_handler_obj);