diff options
author | Antoine Pitrou <solipsis@pitrou.net> | 2012-06-16 20:48:21 (GMT) |
---|---|---|
committer | Antoine Pitrou <solipsis@pitrou.net> | 2012-06-16 20:48:21 (GMT) |
commit | aaefac76ddac60bb332d28ec79702523b93530ee (patch) | |
tree | 747df780a77a130ac1197464d2e2830f5867ba7f /Objects/unicodeobject.c | |
parent | c62bd13cb203f27bc11d36903554f20386b5b8c5 (diff) | |
download | cpython-aaefac76ddac60bb332d28ec79702523b93530ee.zip cpython-aaefac76ddac60bb332d28ec79702523b93530ee.tar.gz cpython-aaefac76ddac60bb332d28ec79702523b93530ee.tar.bz2 |
Issue #14874: Restore charmap decoding speed to pre-PEP 393 levels.
Patch by Serhiy Storchaka.
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 63 |
1 file changed, 48 insertions, 15 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index ce82717..353d2bb 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -7435,24 +7435,53 @@ PyUnicode_DecodeCharmap(const char *s, e = s + size; if (PyUnicode_CheckExact(mapping)) { Py_ssize_t maplen; - enum PyUnicode_Kind kind; - void *data; + enum PyUnicode_Kind mapkind; + void *mapdata; Py_UCS4 x; if (PyUnicode_READY(mapping) == -1) return NULL; maplen = PyUnicode_GET_LENGTH(mapping); - data = PyUnicode_DATA(mapping); - kind = PyUnicode_KIND(mapping); + mapdata = PyUnicode_DATA(mapping); + mapkind = PyUnicode_KIND(mapping); while (s < e) { - unsigned char ch = *s; + unsigned char ch; + if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { + enum PyUnicode_Kind outkind = PyUnicode_KIND(v); + if (outkind == PyUnicode_1BYTE_KIND) { + void *outdata = PyUnicode_DATA(v); + Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v); + while (s < e) { + unsigned char ch = *s; + x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch); + if (x > maxchar) + goto Error; + PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x); + ++s; + } + break; + } + else if (outkind == PyUnicode_2BYTE_KIND) { + void *outdata = PyUnicode_DATA(v); + while (s < e) { + unsigned char ch = *s; + x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch); + if (x == 0xFFFE) + goto Error; + PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x); + ++s; + } + break; + } + } + ch = *s; if (ch < maplen) - x = PyUnicode_READ(kind, data, ch); + x = PyUnicode_READ(mapkind, mapdata, ch); else x = 0xfffe; /* invalid value */ - +Error: if (x == 0xfffe) { /* undefined mapping */ @@ -7667,14 +7696,17 @@ PyUnicode_BuildEncodingMap(PyObject* string) int count2 = 0, count3 = 0; int kind; void *data; + Py_ssize_t length; Py_UCS4 ch; - if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) { + if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) { PyErr_BadArgument(); return NULL; } kind = 
PyUnicode_KIND(string); data = PyUnicode_DATA(string); + length = PyUnicode_GET_LENGTH(string); + length = Py_MIN(length, 256); memset(level1, 0xFF, sizeof level1); memset(level2, 0xFF, sizeof level2); @@ -7683,7 +7715,7 @@ PyUnicode_BuildEncodingMap(PyObject* string) a mapping dictionary. */ if (PyUnicode_READ(kind, data, 0) != 0) need_dict = 1; - for (i = 1; i < 256; i++) { + for (i = 1; i < length; i++) { int l1, l2; ch = PyUnicode_READ(kind, data, i); if (ch == 0 || ch > 0xFFFF) { @@ -7709,7 +7741,7 @@ PyUnicode_BuildEncodingMap(PyObject* string) PyObject *key, *value; if (!result) return NULL; - for (i = 0; i < 256; i++) { + for (i = 0; i < length; i++) { key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); value = PyLong_FromLong(i); if (!key || !value) @@ -7743,17 +7775,18 @@ PyUnicode_BuildEncodingMap(PyObject* string) memset(mlevel2, 0xFF, 16*count2); memset(mlevel3, 0, 128*count3); count3 = 0; - for (i = 1; i < 256; i++) { + for (i = 1; i < length; i++) { int o1, o2, o3, i2, i3; - if (PyUnicode_READ(kind, data, i) == 0xFFFE) + Py_UCS4 ch = PyUnicode_READ(kind, data, i); + if (ch == 0xFFFE) /* unmapped character */ continue; - o1 = PyUnicode_READ(kind, data, i)>>11; - o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF; + o1 = ch>>11; + o2 = (ch>>7) & 0xF; i2 = 16*mlevel1[o1] + o2; if (mlevel2[i2] == 0xFF) mlevel2[i2] = count3++; - o3 = PyUnicode_READ(kind, data, i) & 0x7F; + o3 = ch & 0x7F; i3 = 128*mlevel2[i2] + o3; mlevel3[i3] = i; } |