diff options
author | Victor Stinner <victor.stinner@gmail.com> | 2013-04-17 23:44:27 (GMT) |
---|---|---|
committer | Victor Stinner <victor.stinner@gmail.com> | 2013-04-17 23:44:27 (GMT) |
commit | fb161b1b6d5234fca20b04b374645eaf896a4b38 (patch) | |
tree | 8b91fa6086a4ffcf0448d549377f05f932c01f0a /Objects | |
parent | 33a3c50db58f5075b8860dc4d3bf6cf7a0165dd2 (diff) | |
download | cpython-fb161b1b6d5234fca20b04b374645eaf896a4b38.zip cpython-fb161b1b6d5234fca20b04b374645eaf896a4b38.tar.gz cpython-fb161b1b6d5234fca20b04b374645eaf896a4b38.tar.bz2 |
Split PyUnicode_DecodeCharmap() into subfunction for readability
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 391 |
1 files changed, 213 insertions, 178 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 9a59f38..076674c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -7265,222 +7265,257 @@ PyUnicode_AsMBCSString(PyObject *unicode) /* --- Character Mapping Codec -------------------------------------------- */ -PyObject * -PyUnicode_DecodeCharmap(const char *s, - Py_ssize_t size, - PyObject *mapping, - const char *errors) +static int +charmap_decode_string(const char *s, + Py_ssize_t size, + PyObject *mapping, + const char *errors, + _PyUnicodeWriter *writer) { const char *starts = s; - Py_ssize_t startinpos; - Py_ssize_t endinpos; const char *e; - _PyUnicodeWriter writer; - PyObject *errorHandler = NULL; - PyObject *exc = NULL; - - /* Default to Latin-1 */ - if (mapping == NULL) - return PyUnicode_DecodeLatin1(s, size, errors); + Py_ssize_t startinpos, endinpos; + PyObject *errorHandler = NULL, *exc = NULL; + Py_ssize_t maplen; + enum PyUnicode_Kind mapkind; + void *mapdata; + Py_UCS4 x; + unsigned char ch; + + if (PyUnicode_READY(mapping) == -1) + return -1; - if (size == 0) - _Py_RETURN_UNICODE_EMPTY(); - _PyUnicodeWriter_Init(&writer); - writer.min_length = size; - if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) - goto onError; + maplen = PyUnicode_GET_LENGTH(mapping); + mapdata = PyUnicode_DATA(mapping); + mapkind = PyUnicode_KIND(mapping); e = s + size; - if (PyUnicode_CheckExact(mapping)) { - Py_ssize_t maplen; - enum PyUnicode_Kind mapkind; - void *mapdata; - Py_UCS4 x; - unsigned char ch; - if (PyUnicode_READY(mapping) == -1) - return NULL; + if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) { + /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1 + * is disabled in encoding aliases, latin1 is preferred because + * its implementation is faster. */ + Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata; + Py_UCS1 *outdata = (Py_UCS1 *)writer->data; + Py_UCS4 maxchar = writer->maxchar; - maplen = PyUnicode_GET_LENGTH(mapping); - mapdata = PyUnicode_DATA(mapping); - mapkind = PyUnicode_KIND(mapping); - - if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) { - /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1 - * is disabled in encoding aliases, latin1 is preferred because - * its implementation is faster. */ - Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata; - Py_UCS1 *outdata = (Py_UCS1 *)writer.data; - Py_UCS4 maxchar = writer.maxchar; - - assert (writer.kind == PyUnicode_1BYTE_KIND); - while (s < e) { - ch = *s; - x = mapdata_ucs1[ch]; - if (x > maxchar) { - if (_PyUnicodeWriter_Prepare(&writer, 1, 0xff) == -1) - goto onError; - maxchar = writer.maxchar; - outdata = (Py_UCS1 *)writer.data; - } - outdata[writer.pos] = x; - writer.pos++; - ++s; + assert (writer->kind == PyUnicode_1BYTE_KIND); + while (s < e) { + ch = *s; + x = mapdata_ucs1[ch]; + if (x > maxchar) { + if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1) + goto onError; + maxchar = writer->maxchar; + outdata = (Py_UCS1 *)writer->data; } + outdata[writer->pos] = x; + writer->pos++; + ++s; } + return 0; + } - while (s < e) { - if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { - enum PyUnicode_Kind outkind = writer.kind; - Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata; - if (outkind == PyUnicode_1BYTE_KIND) { - Py_UCS1 *outdata = (Py_UCS1 *)writer.data; - Py_UCS4 maxchar = writer.maxchar; - while (s < e) { - ch = *s; - x = mapdata_ucs2[ch]; - if (x > maxchar) - goto Error; - outdata[writer.pos] = x; - writer.pos++; - ++s; - } - break; + while (s < e) { + if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { + enum PyUnicode_Kind outkind = writer->kind; + Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata; + if (outkind == PyUnicode_1BYTE_KIND) { + Py_UCS1 *outdata = (Py_UCS1 *)writer->data; + Py_UCS4 maxchar = writer->maxchar; + while (s < e) { + ch = *s; + x = mapdata_ucs2[ch]; + if (x > maxchar) + goto Error; + outdata[writer->pos] = x; + writer->pos++; + ++s; } - else if (outkind == PyUnicode_2BYTE_KIND) { - Py_UCS2 *outdata = (Py_UCS2 *)writer.data; - while (s < e) { - ch = *s; - x = mapdata_ucs2[ch]; - if (x == 0xFFFE) - goto Error; - outdata[writer.pos] = x; - writer.pos++; - ++s; - } - break; + break; + } + else if (outkind == PyUnicode_2BYTE_KIND) { + Py_UCS2 *outdata = (Py_UCS2 *)writer->data; + while (s < e) { + ch = *s; + x = mapdata_ucs2[ch]; + if (x == 0xFFFE) + goto Error; + outdata[writer->pos] = x; + writer->pos++; + ++s; } + break; } - ch = *s; + } + ch = *s; - if (ch < maplen) - x = PyUnicode_READ(mapkind, mapdata, ch); - else - x = 0xfffe; /* invalid value */ + if (ch < maplen) + x = PyUnicode_READ(mapkind, mapdata, ch); + else + x = 0xfffe; /* invalid value */ Error: - if (x == 0xfffe) - { - /* undefined mapping */ - startinpos = s-starts; - endinpos = startinpos+1; - if (unicode_decode_call_errorhandler_writer( - errors, &errorHandler, - "charmap", "character maps to <undefined>", - &starts, &e, &startinpos, &endinpos, &exc, &s, - &writer)) { - goto onError; - } - continue; + if (x == 0xfffe) + { + /* undefined mapping */ + startinpos = s-starts; + endinpos = startinpos+1; + if (unicode_decode_call_errorhandler_writer( + errors, &errorHandler, + "charmap", "character maps to <undefined>", + &starts, &e, &startinpos, &endinpos, &exc, &s, + writer)) { + goto onError; } + continue; + } - if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) + if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0) + goto onError; + ++s; + } + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + return 0; + +onError: + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + return -1; +} + +static int +charmap_decode_mapping(const char *s, + Py_ssize_t size, + PyObject *mapping, + const char *errors, + _PyUnicodeWriter *writer) +{ + const char *starts = s; + const char *e; + Py_ssize_t startinpos, endinpos; + PyObject *errorHandler = NULL, *exc = NULL; + unsigned char ch; + PyObject *key, *item; + + e = s + size; + + while (s < e) { + ch = *s; + + /* Get mapping (char ordinal -> integer, Unicode char or None) */ + key = PyLong_FromLong((long)ch); + if (key == NULL) + goto onError; + + item = PyObject_GetItem(mapping, key); + Py_DECREF(key); + if (item == NULL) { + if (PyErr_ExceptionMatches(PyExc_LookupError)) { + /* No mapping found means: mapping is undefined. */ + PyErr_Clear(); + goto Undefined; + } else goto onError; - ++s; } - } - else { - while (s < e) { - unsigned char ch = *s; - PyObject *w, *x; - /* Get mapping (char ordinal -> integer, Unicode char or None) */ - w = PyLong_FromLong((long)ch); - if (w == NULL) + /* Apply mapping */ + if (item == Py_None) + goto Undefined; + if (PyLong_Check(item)) { + long value = PyLong_AS_LONG(item); + if (value == 0xFFFE) + goto Undefined; + if (value < 0 || value > MAX_UNICODE) { + PyErr_Format(PyExc_TypeError, + "character mapping must be in range(0x%lx)", + (unsigned long)MAX_UNICODE + 1); goto onError; - x = PyObject_GetItem(mapping, w); - Py_DECREF(w); - if (x == NULL) { - if (PyErr_ExceptionMatches(PyExc_LookupError)) { - /* No mapping found means: mapping is undefined. */ - PyErr_Clear(); - goto Undefined; - } else - goto onError; } - /* Apply mapping */ - if (x == Py_None) - goto Undefined; - if (PyLong_Check(x)) { - long value = PyLong_AS_LONG(x); + if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) + goto onError; + } + else if (PyUnicode_Check(item)) { + if (PyUnicode_READY(item) == -1) + goto onError; + if (PyUnicode_GET_LENGTH(item) == 1) { + Py_UCS4 value = PyUnicode_READ_CHAR(item, 0); if (value == 0xFFFE) goto Undefined; - if (value < 0 || value > MAX_UNICODE) { - PyErr_Format(PyExc_TypeError, - "character mapping must be in range(0x%lx)", - (unsigned long)MAX_UNICODE + 1); - Py_DECREF(x); - goto onError; - } - - if (_PyUnicodeWriter_WriteCharInline(&writer, value) < 0) { - Py_DECREF(x); + if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) goto onError; - } - } - else if (PyUnicode_Check(x)) { - if (PyUnicode_READY(x) == -1) { - Py_DECREF(x); - goto onError; - } - if (PyUnicode_GET_LENGTH(x) == 1) { - Py_UCS4 value = PyUnicode_READ_CHAR(x, 0); - if (value == 0xFFFE) - goto Undefined; - if (_PyUnicodeWriter_WriteCharInline(&writer, value) < 0) { - Py_DECREF(x); - goto onError; - } - } - else { - writer.overallocate = 1; - if (_PyUnicodeWriter_WriteStr(&writer, x) == -1) { - Py_DECREF(x); - goto onError; - } - } } else { - /* wrong return value */ - PyErr_SetString(PyExc_TypeError, - "character mapping must return integer, None or str"); - Py_DECREF(x); - goto onError; + writer->overallocate = 1; + if (_PyUnicodeWriter_WriteStr(writer, item) == -1) + goto onError; } - Py_DECREF(x); - ++s; - continue; + } + else { + /* wrong return value */ + PyErr_SetString(PyExc_TypeError, + "character mapping must return integer, None or str"); + goto onError; + } + Py_CLEAR(item); + ++s; + continue; + Undefined: - /* undefined mapping */ - Py_XDECREF(x); - startinpos = s-starts; - endinpos = startinpos+1; - if (unicode_decode_call_errorhandler_writer( - errors, &errorHandler, - "charmap", "character maps to <undefined>", - &starts, &e, &startinpos, &endinpos, &exc, &s, - &writer)) { - goto onError; - } + /* undefined mapping */ + Py_CLEAR(item); + startinpos = s-starts; + endinpos = startinpos+1; + if (unicode_decode_call_errorhandler_writer( + errors, &errorHandler, + "charmap", "character maps to <undefined>", + &starts, &e, &startinpos, &endinpos, &exc, &s, + writer)) { + goto onError; } } Py_XDECREF(errorHandler); Py_XDECREF(exc); - return _PyUnicodeWriter_Finish(&writer); + return 0; - onError: +onError: + Py_XDECREF(item); Py_XDECREF(errorHandler); Py_XDECREF(exc); + return -1; +} + +PyObject * +PyUnicode_DecodeCharmap(const char *s, + Py_ssize_t size, + PyObject *mapping, + const char *errors) +{ + _PyUnicodeWriter writer; + + /* Default to Latin-1 */ + if (mapping == NULL) + return PyUnicode_DecodeLatin1(s, size, errors); + + if (size == 0) + _Py_RETURN_UNICODE_EMPTY(); + _PyUnicodeWriter_Init(&writer); + writer.min_length = size; + if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) + goto onError; + + if (PyUnicode_CheckExact(mapping)) { + if (charmap_decode_string(s, size, mapping, errors, &writer) < 0) + goto onError; + } + else { + if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0) + goto onError; + } + return _PyUnicodeWriter_Finish(&writer); + + onError: _PyUnicodeWriter_Dealloc(&writer); return NULL; } |