summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Lib/encodings/cp037.py1
-rw-r--r--Lib/encodings/cp500.py1
-rw-r--r--Lib/encodings/iso8859_1.py1
-rw-r--r--Objects/unicodeobject.c27
4 files changed, 26 insertions, 4 deletions
diff --git a/Lib/encodings/cp037.py b/Lib/encodings/cp037.py
index bfe2c1e..4edd708 100644
--- a/Lib/encodings/cp037.py
+++ b/Lib/encodings/cp037.py
@@ -301,7 +301,6 @@ decoding_table = (
'\xd9' # 0xFD -> LATIN CAPITAL LETTER U WITH GRAVE
'\xda' # 0xFE -> LATIN CAPITAL LETTER U WITH ACUTE
'\x9f' # 0xFF -> CONTROL
- '\ufffe' ## Widen to UCS2 for optimization
)
### Encoding table
diff --git a/Lib/encodings/cp500.py b/Lib/encodings/cp500.py
index a975be7..5f61535 100644
--- a/Lib/encodings/cp500.py
+++ b/Lib/encodings/cp500.py
@@ -301,7 +301,6 @@ decoding_table = (
'\xd9' # 0xFD -> LATIN CAPITAL LETTER U WITH GRAVE
'\xda' # 0xFE -> LATIN CAPITAL LETTER U WITH ACUTE
'\x9f' # 0xFF -> CONTROL
- '\ufffe' ## Widen to UCS2 for optimization
)
### Encoding table
diff --git a/Lib/encodings/iso8859_1.py b/Lib/encodings/iso8859_1.py
index d9cc516..8cfc01f 100644
--- a/Lib/encodings/iso8859_1.py
+++ b/Lib/encodings/iso8859_1.py
@@ -301,7 +301,6 @@ decoding_table = (
'\xfd' # 0xFD -> LATIN SMALL LETTER Y WITH ACUTE
'\xfe' # 0xFE -> LATIN SMALL LETTER THORN (Icelandic)
'\xff' # 0xFF -> LATIN SMALL LETTER Y WITH DIAERESIS
- '\ufffe' ## Widen to UCS2 for optimization
)
### Encoding table
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index e9153c0..88729c8 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -7281,6 +7281,7 @@ PyUnicode_DecodeCharmap(const char *s,
enum PyUnicode_Kind mapkind;
void *mapdata;
Py_UCS4 x;
+ unsigned char ch;
if (PyUnicode_READY(mapping) == -1)
return NULL;
@@ -7288,8 +7289,32 @@ PyUnicode_DecodeCharmap(const char *s,
maplen = PyUnicode_GET_LENGTH(mapping);
mapdata = PyUnicode_DATA(mapping);
mapkind = PyUnicode_KIND(mapping);
+
+ if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
+ /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
+ * is disabled in encoding aliases, latin1 is preferred because
+ * its implementation is faster. */
+ Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
+ Py_UCS1 *outdata = (Py_UCS1 *)writer.data;
+ Py_UCS4 maxchar = writer.maxchar;
+
+ assert (writer.kind == PyUnicode_1BYTE_KIND);
+ while (s < e) {
+ ch = *s;
+ x = mapdata_ucs1[ch];
+ if (x > maxchar) {
+ if (_PyUnicodeWriter_PrepareInternal(&writer, 1, 0xff) == -1)
+ goto onError;
+ maxchar = writer.maxchar;
+ outdata = (Py_UCS1 *)writer.data;
+ }
+ outdata[writer.pos] = x;
+ writer.pos++;
+ ++s;
+ }
+ }
+
while (s < e) {
- unsigned char ch;
if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
enum PyUnicode_Kind outkind = writer.kind;
void *outdata = writer.data;