From 03c3e35d42a2f9855fca4beb89e5cbbefe2d9c21 Mon Sep 17 00:00:00 2001
From: Victor Stinner
Date: Tue, 9 Apr 2013 21:53:09 +0200
Subject: Add fast-path in PyUnicode_DecodeCharmap() for pure 8 bit encodings: cp037, cp500 and iso8859_1 codecs

---
 Lib/encodings/cp037.py     |  1 -
 Lib/encodings/cp500.py     |  1 -
 Lib/encodings/iso8859_1.py |  1 -
 Objects/unicodeobject.c    | 27 ++++++++++++++++++++++++++-
 4 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/Lib/encodings/cp037.py b/Lib/encodings/cp037.py
index bfe2c1e..4edd708 100644
--- a/Lib/encodings/cp037.py
+++ b/Lib/encodings/cp037.py
@@ -301,7 +301,6 @@ decoding_table = (
     '\xd9' # 0xFD -> LATIN CAPITAL LETTER U WITH GRAVE
     '\xda' # 0xFE -> LATIN CAPITAL LETTER U WITH ACUTE
     '\x9f' # 0xFF -> CONTROL
-    '\ufffe' ## Widen to UCS2 for optimization
 )

 ### Encoding table
diff --git a/Lib/encodings/cp500.py b/Lib/encodings/cp500.py
index a975be7..5f61535 100644
--- a/Lib/encodings/cp500.py
+++ b/Lib/encodings/cp500.py
@@ -301,7 +301,6 @@ decoding_table = (
     '\xd9' # 0xFD -> LATIN CAPITAL LETTER U WITH GRAVE
     '\xda' # 0xFE -> LATIN CAPITAL LETTER U WITH ACUTE
     '\x9f' # 0xFF -> CONTROL
-    '\ufffe' ## Widen to UCS2 for optimization
 )

 ### Encoding table
diff --git a/Lib/encodings/iso8859_1.py b/Lib/encodings/iso8859_1.py
index d9cc516..8cfc01f 100644
--- a/Lib/encodings/iso8859_1.py
+++ b/Lib/encodings/iso8859_1.py
@@ -301,7 +301,6 @@ decoding_table = (
     '\xfd' # 0xFD -> LATIN SMALL LETTER Y WITH ACUTE
     '\xfe' # 0xFE -> LATIN SMALL LETTER THORN (Icelandic)
     '\xff' # 0xFF -> LATIN SMALL LETTER Y WITH DIAERESIS
-    '\ufffe' ## Widen to UCS2 for optimization
 )

 ### Encoding table
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index e9153c0..88729c8 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -7281,6 +7281,7 @@ PyUnicode_DecodeCharmap(const char *s,
         enum PyUnicode_Kind mapkind;
         void *mapdata;
         Py_UCS4 x;
+        unsigned char ch;

         if (PyUnicode_READY(mapping) == -1)
             return NULL;
@@ -7288,8 +7289,32 @@ PyUnicode_DecodeCharmap(const char *s,
         maplen = PyUnicode_GET_LENGTH(mapping);
         mapdata = PyUnicode_DATA(mapping);
         mapkind = PyUnicode_KIND(mapping);
+
+        if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
+            /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
+             * is disabled in encoding aliases, latin1 is preferred because
+             * its implementation is faster. */
+            Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
+            Py_UCS1 *outdata = (Py_UCS1 *)writer.data;
+            Py_UCS4 maxchar = writer.maxchar;
+
+            assert (writer.kind == PyUnicode_1BYTE_KIND);
+            while (s < e) {
+                ch = *s;
+                x = mapdata_ucs1[ch];
+                if (x > maxchar) {
+                    if (_PyUnicodeWriter_PrepareInternal(&writer, 1, 0xff) == -1)
+                        goto onError;
+                    maxchar = writer.maxchar;
+                    outdata = (Py_UCS1 *)writer.data;
+                }
+                outdata[writer.pos] = x;
+                writer.pos++;
+                ++s;
+            }
+        }
+
         while (s < e) {
-            unsigned char ch;
             if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
                 enum PyUnicode_Kind outkind = writer.kind;
                 void *outdata = writer.data;
-- 
cgit v0.12