From aaefac76ddac60bb332d28ec79702523b93530ee Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Sat, 16 Jun 2012 22:48:21 +0200 Subject: Issue #14874: Restore charmap decoding speed to pre-PEP 393 levels. Patch by Serhiy Storchaka. --- Lib/codecs.py | 5 +- Lib/encodings/cp037.py | 1 + Lib/encodings/cp500.py | 1 + Lib/encodings/hp_roman8.py | 377 ++++++++++++++++++++++++++++------------ Lib/encodings/iso8859_1.py | 1 + Lib/encodings/mac_latin2.py | 409 +++++++++++++++++++++++++++++--------------- Lib/encodings/palmos.py | 312 ++++++++++++++++++++++++++++----- Lib/encodings/ptcp154.py | 399 ++++++++++++++++++++++++++++-------------- Misc/NEWS | 3 + Objects/unicodeobject.c | 63 +++++-- Tools/unicode/gencodec.py | 9 +- 11 files changed, 1136 insertions(+), 444 deletions(-) diff --git a/Lib/codecs.py b/Lib/codecs.py index e63a0c6..9901d5c 100644 --- a/Lib/codecs.py +++ b/Lib/codecs.py @@ -1042,10 +1042,7 @@ def make_identity_dict(rng): mapped to themselves. """ - res = {} - for i in rng: - res[i]=i - return res + return {i:i for i in rng} def make_encoding_map(decoding_map): diff --git a/Lib/encodings/cp037.py b/Lib/encodings/cp037.py index 4edd708..bfe2c1e 100644 --- a/Lib/encodings/cp037.py +++ b/Lib/encodings/cp037.py @@ -301,6 +301,7 @@ decoding_table = ( '\xd9' # 0xFD -> LATIN CAPITAL LETTER U WITH GRAVE '\xda' # 0xFE -> LATIN CAPITAL LETTER U WITH ACUTE '\x9f' # 0xFF -> CONTROL + '\ufffe' ## Widen to UCS2 for optimization ) ### Encoding table diff --git a/Lib/encodings/cp500.py b/Lib/encodings/cp500.py index 5f61535..a975be7 100644 --- a/Lib/encodings/cp500.py +++ b/Lib/encodings/cp500.py @@ -301,6 +301,7 @@ decoding_table = ( '\xd9' # 0xFD -> LATIN CAPITAL LETTER U WITH GRAVE '\xda' # 0xFE -> LATIN CAPITAL LETTER U WITH ACUTE '\x9f' # 0xFF -> CONTROL + '\ufffe' ## Widen to UCS2 for optimization ) ### Encoding table diff --git a/Lib/encodings/hp_roman8.py b/Lib/encodings/hp_roman8.py index dbaaa72..a93523e 100644 --- a/Lib/encodings/hp_roman8.py +++ b/Lib/encodings/hp_roman8.py @@ -14,18 +14,18 @@ import codecs class Codec(codecs.Codec): def encode(self,input,errors='strict'): - return codecs.charmap_encode(input,errors,encoding_map) + return codecs.charmap_encode(input,errors,encoding_table) def decode(self,input,errors='strict'): - return codecs.charmap_decode(input,errors,decoding_map) + return codecs.charmap_decode(input,errors,decoding_table) class IncrementalEncoder(codecs.IncrementalEncoder): def encode(self, input, final=False): - return codecs.charmap_encode(input,self.errors,encoding_map)[0] + return codecs.charmap_encode(input,self.errors,encoding_table)[0] class IncrementalDecoder(codecs.IncrementalDecoder): def decode(self, input, final=False): - return codecs.charmap_decode(input,self.errors,decoding_map)[0] + return codecs.charmap_decode(input,self.errors,decoding_table)[0] class StreamWriter(Codec,codecs.StreamWriter): pass @@ -46,107 +46,268 @@ def getregentry(): streamreader=StreamReader, ) -### Decoding Map - -decoding_map = codecs.make_identity_dict(range(256)) -decoding_map.update({ - 0x00a1: 0x00c0, # LATIN CAPITAL LETTER A WITH GRAVE - 0x00a2: 0x00c2, # LATIN CAPITAL LETTER A WITH CIRCUMFLEX - 0x00a3: 0x00c8, # LATIN CAPITAL LETTER E WITH GRAVE - 0x00a4: 0x00ca, # LATIN CAPITAL LETTER E WITH CIRCUMFLEX - 0x00a5: 0x00cb, # LATIN CAPITAL LETTER E WITH DIAERESIS - 0x00a6: 0x00ce, # LATIN CAPITAL LETTER I WITH CIRCUMFLEX - 0x00a7: 0x00cf, # LATIN CAPITAL LETTER I WITH DIAERESIS - 0x00a8: 0x00b4, # ACUTE ACCENT - 0x00a9: 0x02cb, # MODIFIER LETTER GRAVE ACCENT (Mandarin Chinese fourth tone) - 0x00aa: 0x02c6, # MODIFIER LETTER CIRCUMFLEX ACCENT - 0x00ab: 0x00a8, # DIAERESIS - 0x00ac: 0x02dc, # SMALL TILDE - 0x00ad: 0x00d9, # LATIN CAPITAL LETTER U WITH GRAVE - 0x00ae: 0x00db, # LATIN CAPITAL LETTER U WITH CIRCUMFLEX - 0x00af: 0x20a4, # LIRA SIGN - 0x00b0: 0x00af, # MACRON - 0x00b1: 0x00dd, # LATIN CAPITAL LETTER Y WITH ACUTE - 0x00b2: 0x00fd, # LATIN SMALL LETTER Y WITH ACUTE - 0x00b3: 0x00b0, # DEGREE SIGN - 0x00b4: 0x00c7, # LATIN CAPITAL LETTER C WITH CEDILLA - 0x00b5: 0x00e7, # LATIN SMALL LETTER C WITH CEDILLA - 0x00b6: 0x00d1, # LATIN CAPITAL LETTER N WITH TILDE - 0x00b7: 0x00f1, # LATIN SMALL LETTER N WITH TILDE - 0x00b8: 0x00a1, # INVERTED EXCLAMATION MARK - 0x00b9: 0x00bf, # INVERTED QUESTION MARK - 0x00ba: 0x00a4, # CURRENCY SIGN - 0x00bb: 0x00a3, # POUND SIGN - 0x00bc: 0x00a5, # YEN SIGN - 0x00bd: 0x00a7, # SECTION SIGN - 0x00be: 0x0192, # LATIN SMALL LETTER F WITH HOOK - 0x00bf: 0x00a2, # CENT SIGN - 0x00c0: 0x00e2, # LATIN SMALL LETTER A WITH CIRCUMFLEX - 0x00c1: 0x00ea, # LATIN SMALL LETTER E WITH CIRCUMFLEX - 0x00c2: 0x00f4, # LATIN SMALL LETTER O WITH CIRCUMFLEX - 0x00c3: 0x00fb, # LATIN SMALL LETTER U WITH CIRCUMFLEX - 0x00c4: 0x00e1, # LATIN SMALL LETTER A WITH ACUTE - 0x00c5: 0x00e9, # LATIN SMALL LETTER E WITH ACUTE - 0x00c6: 0x00f3, # LATIN SMALL LETTER O WITH ACUTE - 0x00c7: 0x00fa, # LATIN SMALL LETTER U WITH ACUTE - 0x00c8: 0x00e0, # LATIN SMALL LETTER A WITH GRAVE - 0x00c9: 0x00e8, # LATIN SMALL LETTER E WITH GRAVE - 0x00ca: 0x00f2, # LATIN SMALL LETTER O WITH GRAVE - 0x00cb: 0x00f9, # LATIN SMALL LETTER U WITH GRAVE - 0x00cc: 0x00e4, # LATIN SMALL LETTER A WITH DIAERESIS - 0x00cd: 0x00eb, # LATIN SMALL LETTER E WITH DIAERESIS - 0x00ce: 0x00f6, # LATIN SMALL LETTER O WITH DIAERESIS - 0x00cf: 0x00fc, # LATIN SMALL LETTER U WITH DIAERESIS - 0x00d0: 0x00c5, # LATIN CAPITAL LETTER A WITH RING ABOVE - 0x00d1: 0x00ee, # LATIN SMALL LETTER I WITH CIRCUMFLEX - 0x00d2: 0x00d8, # LATIN CAPITAL LETTER O WITH STROKE - 0x00d3: 0x00c6, # LATIN CAPITAL LETTER AE - 0x00d4: 0x00e5, # LATIN SMALL LETTER A WITH RING ABOVE - 0x00d5: 0x00ed, # LATIN SMALL LETTER I WITH ACUTE - 0x00d6: 0x00f8, # LATIN SMALL LETTER O WITH STROKE - 0x00d7: 0x00e6, # LATIN SMALL LETTER AE - 0x00d8: 0x00c4, # LATIN CAPITAL LETTER A WITH DIAERESIS - 0x00d9: 0x00ec, # LATIN SMALL LETTER I WITH GRAVE - 0x00da: 0x00d6, # LATIN CAPITAL LETTER O WITH DIAERESIS - 0x00db: 0x00dc, # LATIN CAPITAL LETTER U WITH DIAERESIS - 0x00dc: 0x00c9, # LATIN CAPITAL LETTER E WITH ACUTE - 0x00dd: 0x00ef, # LATIN SMALL LETTER I WITH DIAERESIS - 0x00de: 0x00df, # LATIN SMALL LETTER SHARP S (German) - 0x00df: 0x00d4, # LATIN CAPITAL LETTER O WITH CIRCUMFLEX - 0x00e0: 0x00c1, # LATIN CAPITAL LETTER A WITH ACUTE - 0x00e1: 0x00c3, # LATIN CAPITAL LETTER A WITH TILDE - 0x00e2: 0x00e3, # LATIN SMALL LETTER A WITH TILDE - 0x00e3: 0x00d0, # LATIN CAPITAL LETTER ETH (Icelandic) - 0x00e4: 0x00f0, # LATIN SMALL LETTER ETH (Icelandic) - 0x00e5: 0x00cd, # LATIN CAPITAL LETTER I WITH ACUTE - 0x00e6: 0x00cc, # LATIN CAPITAL LETTER I WITH GRAVE - 0x00e7: 0x00d3, # LATIN CAPITAL LETTER O WITH ACUTE - 0x00e8: 0x00d2, # LATIN CAPITAL LETTER O WITH GRAVE - 0x00e9: 0x00d5, # LATIN CAPITAL LETTER O WITH TILDE - 0x00ea: 0x00f5, # LATIN SMALL LETTER O WITH TILDE - 0x00eb: 0x0160, # LATIN CAPITAL LETTER S WITH CARON - 0x00ec: 0x0161, # LATIN SMALL LETTER S WITH CARON - 0x00ed: 0x00da, # LATIN CAPITAL LETTER U WITH ACUTE - 0x00ee: 0x0178, # LATIN CAPITAL LETTER Y WITH DIAERESIS - 0x00ef: 0x00ff, # LATIN SMALL LETTER Y WITH DIAERESIS - 0x00f0: 0x00de, # LATIN CAPITAL LETTER THORN (Icelandic) - 0x00f1: 0x00fe, # LATIN SMALL LETTER THORN (Icelandic) - 0x00f2: 0x00b7, # MIDDLE DOT - 0x00f3: 0x00b5, # MICRO SIGN - 0x00f4: 0x00b6, # PILCROW SIGN - 0x00f5: 0x00be, # VULGAR FRACTION THREE QUARTERS - 0x00f6: 0x2014, # EM DASH - 0x00f7: 0x00bc, # VULGAR FRACTION ONE QUARTER - 0x00f8: 0x00bd, # VULGAR FRACTION ONE HALF - 0x00f9: 0x00aa, # FEMININE ORDINAL INDICATOR - 0x00fa: 0x00ba, # MASCULINE ORDINAL INDICATOR - 0x00fb: 0x00ab, # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK - 0x00fc: 0x25a0, # BLACK SQUARE - 0x00fd: 0x00bb, # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK - 0x00fe: 0x00b1, # PLUS-MINUS SIGN - 0x00ff: None, -}) - -### Encoding Map - -encoding_map = codecs.make_encoding_map(decoding_map) + +### Decoding Table + +decoding_table = ( + '\x00' # 0x00 -> NULL + '\x01' # 0x01 -> START OF HEADING + '\x02' # 0x02 -> START OF TEXT + '\x03' # 0x03 -> END OF TEXT + '\x04' # 0x04 -> END OF TRANSMISSION + '\x05' # 0x05 -> ENQUIRY + '\x06' # 0x06 -> ACKNOWLEDGE + '\x07' # 0x07 -> BELL + '\x08' # 0x08 -> BACKSPACE + '\t' # 0x09 -> HORIZONTAL TABULATION + '\n' # 0x0A -> LINE FEED + '\x0b' # 0x0B -> VERTICAL TABULATION + '\x0c' # 0x0C -> FORM FEED + '\r' # 0x0D -> CARRIAGE RETURN + '\x0e' # 0x0E -> SHIFT OUT + '\x0f' # 0x0F -> SHIFT IN + '\x10' # 0x10 -> DATA LINK ESCAPE + '\x11' # 0x11 -> DEVICE CONTROL ONE + '\x12' # 0x12 -> DEVICE CONTROL TWO + '\x13' # 0x13 -> DEVICE CONTROL THREE + '\x14' # 0x14 -> DEVICE CONTROL FOUR + '\x15' # 0x15 -> NEGATIVE ACKNOWLEDGE + '\x16' # 0x16 -> SYNCHRONOUS IDLE + '\x17' # 0x17 -> END OF TRANSMISSION BLOCK + '\x18' # 0x18 -> CANCEL + '\x19' # 0x19 -> END OF MEDIUM + '\x1a' # 0x1A -> SUBSTITUTE + '\x1b' # 0x1B -> ESCAPE + '\x1c' # 0x1C -> FILE SEPARATOR + '\x1d' # 0x1D -> GROUP SEPARATOR + '\x1e' # 0x1E -> RECORD SEPARATOR + '\x1f' # 0x1F -> UNIT SEPARATOR + ' ' # 0x20 -> SPACE + '!' # 0x21 -> EXCLAMATION MARK + '"' # 0x22 -> QUOTATION MARK + '#' # 0x23 -> NUMBER SIGN + '$' # 0x24 -> DOLLAR SIGN + '%' # 0x25 -> PERCENT SIGN + '&' # 0x26 -> AMPERSAND + "'" # 0x27 -> APOSTROPHE + '(' # 0x28 -> LEFT PARENTHESIS + ')' # 0x29 -> RIGHT PARENTHESIS + '*' # 0x2A -> ASTERISK + '+' # 0x2B -> PLUS SIGN + ',' # 0x2C -> COMMA + '-' # 0x2D -> HYPHEN-MINUS + '.' # 0x2E -> FULL STOP + '/' # 0x2F -> SOLIDUS + '0' # 0x30 -> DIGIT ZERO + '1' # 0x31 -> DIGIT ONE + '2' # 0x32 -> DIGIT TWO + '3' # 0x33 -> DIGIT THREE + '4' # 0x34 -> DIGIT FOUR + '5' # 0x35 -> DIGIT FIVE + '6' # 0x36 -> DIGIT SIX + '7' # 0x37 -> DIGIT SEVEN + '8' # 0x38 -> DIGIT EIGHT + '9' # 0x39 -> DIGIT NINE + ':' # 0x3A -> COLON + ';' # 0x3B -> SEMICOLON + '<' # 0x3C -> LESS-THAN SIGN + '=' # 0x3D -> EQUALS SIGN + '>' # 0x3E -> GREATER-THAN SIGN + '?' # 0x3F -> QUESTION MARK + '@' # 0x40 -> COMMERCIAL AT + 'A' # 0x41 -> LATIN CAPITAL LETTER A + 'B' # 0x42 -> LATIN CAPITAL LETTER B + 'C' # 0x43 -> LATIN CAPITAL LETTER C + 'D' # 0x44 -> LATIN CAPITAL LETTER D + 'E' # 0x45 -> LATIN CAPITAL LETTER E + 'F' # 0x46 -> LATIN CAPITAL LETTER F + 'G' # 0x47 -> LATIN CAPITAL LETTER G + 'H' # 0x48 -> LATIN CAPITAL LETTER H + 'I' # 0x49 -> LATIN CAPITAL LETTER I + 'J' # 0x4A -> LATIN CAPITAL LETTER J + 'K' # 0x4B -> LATIN CAPITAL LETTER K + 'L' # 0x4C -> LATIN CAPITAL LETTER L + 'M' # 0x4D -> LATIN CAPITAL LETTER M + 'N' # 0x4E -> LATIN CAPITAL LETTER N + 'O' # 0x4F -> LATIN CAPITAL LETTER O + 'P' # 0x50 -> LATIN CAPITAL LETTER P + 'Q' # 0x51 -> LATIN CAPITAL LETTER Q + 'R' # 0x52 -> LATIN CAPITAL LETTER R + 'S' # 0x53 -> LATIN CAPITAL LETTER S + 'T' # 0x54 -> LATIN CAPITAL LETTER T + 'U' # 0x55 -> LATIN CAPITAL LETTER U + 'V' # 0x56 -> LATIN CAPITAL LETTER V + 'W' # 0x57 -> LATIN CAPITAL LETTER W + 'X' # 0x58 -> LATIN CAPITAL LETTER X + 'Y' # 0x59 -> LATIN CAPITAL LETTER Y + 'Z' # 0x5A -> LATIN CAPITAL LETTER Z + '[' # 0x5B -> LEFT SQUARE BRACKET + '\\' # 0x5C -> REVERSE SOLIDUS + ']' # 0x5D -> RIGHT SQUARE BRACKET + '^' # 0x5E -> CIRCUMFLEX ACCENT + '_' # 0x5F -> LOW LINE + '`' # 0x60 -> GRAVE ACCENT + 'a' # 0x61 -> LATIN SMALL LETTER A + 'b' # 0x62 -> LATIN SMALL LETTER B + 'c' # 0x63 -> LATIN SMALL LETTER C + 'd' # 0x64 -> LATIN SMALL LETTER D + 'e' # 0x65 -> LATIN SMALL LETTER E + 'f' # 0x66 -> LATIN SMALL LETTER F + 'g' # 0x67 -> LATIN SMALL LETTER G + 'h' # 0x68 -> LATIN SMALL LETTER H + 'i' # 0x69 -> LATIN SMALL LETTER I + 'j' # 0x6A -> LATIN SMALL LETTER J + 'k' # 0x6B -> LATIN SMALL LETTER K + 'l' # 0x6C -> LATIN SMALL LETTER L + 'm' # 0x6D -> LATIN SMALL LETTER M + 'n' # 0x6E -> LATIN SMALL LETTER N + 'o' # 0x6F -> LATIN SMALL LETTER O + 'p' # 0x70 -> LATIN SMALL LETTER P + 'q' # 0x71 -> LATIN SMALL LETTER Q + 'r' # 0x72 -> LATIN SMALL LETTER R + 's' # 0x73 -> LATIN SMALL LETTER S + 't' # 0x74 -> LATIN SMALL LETTER T + 'u' # 0x75 -> LATIN SMALL LETTER U + 'v' # 0x76 -> LATIN SMALL LETTER V + 'w' # 0x77 -> LATIN SMALL LETTER W + 'x' # 0x78 -> LATIN SMALL LETTER X + 'y' # 0x79 -> LATIN SMALL LETTER Y + 'z' # 0x7A -> LATIN SMALL LETTER Z + '{' # 0x7B -> LEFT CURLY BRACKET + '|' # 0x7C -> VERTICAL LINE + '}' # 0x7D -> RIGHT CURLY BRACKET + '~' # 0x7E -> TILDE + '\x7f' # 0x7F -> DELETE + '\x80' # 0x80 -> + '\x81' # 0x81 -> + '\x82' # 0x82 -> + '\x83' # 0x83 -> + '\x84' # 0x84 -> + '\x85' # 0x85 -> + '\x86' # 0x86 -> + '\x87' # 0x87 -> + '\x88' # 0x88 -> + '\x89' # 0x89 -> + '\x8a' # 0x8A -> + '\x8b' # 0x8B -> + '\x8c' # 0x8C -> + '\x8d' # 0x8D -> + '\x8e' # 0x8E -> + '\x8f' # 0x8F -> + '\x90' # 0x90 -> + '\x91' # 0x91 -> + '\x92' # 0x92 -> + '\x93' # 0x93 -> + '\x94' # 0x94 -> + '\x95' # 0x95 -> + '\x96' # 0x96 -> + '\x97' # 0x97 -> + '\x98' # 0x98 -> + '\x99' # 0x99 -> + '\x9a' # 0x9A -> + '\x9b' # 0x9B -> + '\x9c' # 0x9C -> + '\x9d' # 0x9D -> + '\x9e' # 0x9E -> + '\x9f' # 0x9F -> + '\xa0' # 0xA0 -> NO-BREAK SPACE + '\xc0' # 0xA1 -> LATIN CAPITAL LETTER A WITH GRAVE + '\xc2' # 0xA2 -> LATIN CAPITAL LETTER A WITH CIRCUMFLEX + '\xc8' # 0xA3 -> LATIN CAPITAL LETTER E WITH GRAVE + '\xca' # 0xA4 -> LATIN CAPITAL LETTER E WITH CIRCUMFLEX + '\xcb' # 0xA5 -> LATIN CAPITAL LETTER E WITH DIAERESIS + '\xce' # 0xA6 -> LATIN CAPITAL LETTER I WITH CIRCUMFLEX + '\xcf' # 0xA7 -> LATIN CAPITAL LETTER I WITH DIAERESIS + '\xb4' # 0xA8 -> ACUTE ACCENT + '\u02cb' # 0xA9 -> MODIFIER LETTER GRAVE ACCENT (MANDARIN CHINESE FOURTH TONE) + '\u02c6' # 0xAA -> MODIFIER LETTER CIRCUMFLEX ACCENT + '\xa8' # 0xAB -> DIAERESIS + '\u02dc' # 0xAC -> SMALL TILDE + '\xd9' # 0xAD -> LATIN CAPITAL LETTER U WITH GRAVE + '\xdb' # 0xAE -> LATIN CAPITAL LETTER U WITH CIRCUMFLEX + '\u20a4' # 0xAF -> LIRA SIGN + '\xaf' # 0xB0 -> MACRON + '\xdd' # 0xB1 -> LATIN CAPITAL LETTER Y WITH ACUTE + '\xfd' # 0xB2 -> LATIN SMALL LETTER Y WITH ACUTE + '\xb0' # 0xB3 -> DEGREE SIGN + '\xc7' # 0xB4 -> LATIN CAPITAL LETTER C WITH CEDILLA + '\xe7' # 0xB5 -> LATIN SMALL LETTER C WITH CEDILLA + '\xd1' # 0xB6 -> LATIN CAPITAL LETTER N WITH TILDE + '\xf1' # 0xB7 -> LATIN SMALL LETTER N WITH TILDE + '\xa1' # 0xB8 -> INVERTED EXCLAMATION MARK + '\xbf' # 0xB9 -> INVERTED QUESTION MARK + '\xa4' # 0xBA -> CURRENCY SIGN + '\xa3' # 0xBB -> POUND SIGN + '\xa5' # 0xBC -> YEN SIGN + '\xa7' # 0xBD -> SECTION SIGN + '\u0192' # 0xBE -> LATIN SMALL LETTER F WITH HOOK + '\xa2' # 0xBF -> CENT SIGN + '\xe2' # 0xC0 -> LATIN SMALL LETTER A WITH CIRCUMFLEX + '\xea' # 0xC1 -> LATIN SMALL LETTER E WITH CIRCUMFLEX + '\xf4' # 0xC2 -> LATIN SMALL LETTER O WITH CIRCUMFLEX + '\xfb' # 0xC3 -> LATIN SMALL LETTER U WITH CIRCUMFLEX + '\xe1' # 0xC4 -> LATIN SMALL LETTER A WITH ACUTE + '\xe9' # 0xC5 -> LATIN SMALL LETTER E WITH ACUTE + '\xf3' # 0xC6 -> LATIN SMALL LETTER O WITH ACUTE + '\xfa' # 0xC7 -> LATIN SMALL LETTER U WITH ACUTE + '\xe0' # 0xC8 -> LATIN SMALL LETTER A WITH GRAVE + '\xe8' # 0xC9 -> LATIN SMALL LETTER E WITH GRAVE + '\xf2' # 0xCA -> LATIN SMALL LETTER O WITH GRAVE + '\xf9' # 0xCB -> LATIN SMALL LETTER U WITH GRAVE + '\xe4' # 0xCC -> LATIN SMALL LETTER A WITH DIAERESIS + '\xeb' # 0xCD -> LATIN SMALL LETTER E WITH DIAERESIS + '\xf6' # 0xCE -> LATIN SMALL LETTER O WITH DIAERESIS + '\xfc' # 0xCF -> LATIN SMALL LETTER U WITH DIAERESIS + '\xc5' # 0xD0 -> LATIN CAPITAL LETTER A WITH RING ABOVE + '\xee' # 0xD1 -> LATIN SMALL LETTER I WITH CIRCUMFLEX + '\xd8' # 0xD2 -> LATIN CAPITAL LETTER O WITH STROKE + '\xc6' # 0xD3 -> LATIN CAPITAL LETTER AE + '\xe5' # 0xD4 -> LATIN SMALL LETTER A WITH RING ABOVE + '\xed' # 0xD5 -> LATIN SMALL LETTER I WITH ACUTE + '\xf8' # 0xD6 -> LATIN SMALL LETTER O WITH STROKE + '\xe6' # 0xD7 -> LATIN SMALL LETTER AE + '\xc4' # 0xD8 -> LATIN CAPITAL LETTER A WITH DIAERESIS + '\xec' # 0xD9 -> LATIN SMALL LETTER I WITH GRAVE + '\xd6' # 0xDA -> LATIN CAPITAL LETTER O WITH DIAERESIS + '\xdc' # 0xDB -> LATIN CAPITAL LETTER U WITH DIAERESIS + '\xc9' # 0xDC -> LATIN CAPITAL LETTER E WITH ACUTE + '\xef' # 0xDD -> LATIN SMALL LETTER I WITH DIAERESIS + '\xdf' # 0xDE -> LATIN SMALL LETTER SHARP S (GERMAN) + '\xd4' # 0xDF -> LATIN CAPITAL LETTER O WITH CIRCUMFLEX + '\xc1' # 0xE0 -> LATIN CAPITAL LETTER A WITH ACUTE + '\xc3' # 0xE1 -> LATIN CAPITAL LETTER A WITH TILDE + '\xe3' # 0xE2 -> LATIN SMALL LETTER A WITH TILDE + '\xd0' # 0xE3 -> LATIN CAPITAL LETTER ETH (ICELANDIC) + '\xf0' # 0xE4 -> LATIN SMALL LETTER ETH (ICELANDIC) + '\xcd' # 0xE5 -> LATIN CAPITAL LETTER I WITH ACUTE + '\xcc' # 0xE6 -> LATIN CAPITAL LETTER I WITH GRAVE + '\xd3' # 0xE7 -> LATIN CAPITAL LETTER O WITH ACUTE + '\xd2' # 0xE8 -> LATIN CAPITAL LETTER O WITH GRAVE + '\xd5' # 0xE9 -> LATIN CAPITAL LETTER O WITH TILDE + '\xf5' # 0xEA -> LATIN SMALL LETTER O WITH TILDE + '\u0160' # 0xEB -> LATIN CAPITAL LETTER S WITH CARON + '\u0161' # 0xEC -> LATIN SMALL LETTER S WITH CARON + '\xda' # 0xED -> LATIN CAPITAL LETTER U WITH ACUTE + '\u0178' # 0xEE -> LATIN CAPITAL LETTER Y WITH DIAERESIS + '\xff' # 0xEF -> LATIN SMALL LETTER Y WITH DIAERESIS + '\xde' # 0xF0 -> LATIN CAPITAL LETTER THORN (ICELANDIC) + '\xfe' # 0xF1 -> LATIN SMALL LETTER THORN (ICELANDIC) + '\xb7' # 0xF2 -> MIDDLE DOT + '\xb5' # 0xF3 -> MICRO SIGN + '\xb6' # 0xF4 -> PILCROW SIGN + '\xbe' # 0xF5 -> VULGAR FRACTION THREE QUARTERS + '\u2014' # 0xF6 -> EM DASH + '\xbc' # 0xF7 -> VULGAR FRACTION ONE QUARTER + '\xbd' # 0xF8 -> VULGAR FRACTION ONE HALF + '\xaa' # 0xF9 -> FEMININE ORDINAL INDICATOR + '\xba' # 0xFA -> MASCULINE ORDINAL INDICATOR + '\xab' # 0xFB -> LEFT-POINTING DOUBLE ANGLE QUOTATION MARK + '\u25a0' # 0xFC -> BLACK SQUARE + '\xbb' # 0xFD -> RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + '\xb1' # 0xFE -> PLUS-MINUS SIGN + '\ufffe' +) + +### Encoding table +encoding_table=codecs.charmap_build(decoding_table) + diff --git a/Lib/encodings/iso8859_1.py b/Lib/encodings/iso8859_1.py index 8cfc01f..d9cc516 100644 --- a/Lib/encodings/iso8859_1.py +++ b/Lib/encodings/iso8859_1.py @@ -301,6 +301,7 @@ decoding_table = ( '\xfd' # 0xFD -> LATIN SMALL LETTER Y WITH ACUTE '\xfe' # 0xFE -> LATIN SMALL LETTER THORN (Icelandic) '\xff' # 0xFF -> LATIN SMALL LETTER Y WITH DIAERESIS + '\ufffe' ## Widen to UCS2 for optimization ) ### Encoding table diff --git a/Lib/encodings/mac_latin2.py b/Lib/encodings/mac_latin2.py index e322be2..da9d4b1 100644 --- a/Lib/encodings/mac_latin2.py +++ b/Lib/encodings/mac_latin2.py @@ -1,4 +1,4 @@ -""" Python Character Mapping Codec generated from 'LATIN2.TXT' with gencodec.py. +""" Python Character Mapping Codec mac_latin2 generated from 'MAPPINGS/VENDORS/MICSFT/MAC/LATIN2.TXT' with gencodec.py. Written by Marc-Andre Lemburg (mal@lemburg.com). @@ -14,18 +14,18 @@ import codecs class Codec(codecs.Codec): def encode(self,input,errors='strict'): - return codecs.charmap_encode(input,errors,encoding_map) + return codecs.charmap_encode(input,errors,encoding_table) def decode(self,input,errors='strict'): - return codecs.charmap_decode(input,errors,decoding_map) + return codecs.charmap_decode(input,errors,decoding_table) class IncrementalEncoder(codecs.IncrementalEncoder): def encode(self, input, final=False): - return codecs.charmap_encode(input,self.errors,encoding_map)[0] + return codecs.charmap_encode(input,self.errors,encoding_table)[0] class IncrementalDecoder(codecs.IncrementalDecoder): def decode(self, input, final=False): - return codecs.charmap_decode(input,self.errors,decoding_map)[0] + return codecs.charmap_decode(input,self.errors,decoding_table)[0] class StreamWriter(Codec,codecs.StreamWriter): pass @@ -46,138 +46,267 @@ def getregentry(): streamwriter=StreamWriter, ) -### Decoding Map - -decoding_map = codecs.make_identity_dict(range(256)) -decoding_map.update({ - 0x0080: 0x00c4, # LATIN CAPITAL LETTER A WITH DIAERESIS - 0x0081: 0x0100, # LATIN CAPITAL LETTER A WITH MACRON - 0x0082: 0x0101, # LATIN SMALL LETTER A WITH MACRON - 0x0083: 0x00c9, # LATIN CAPITAL LETTER E WITH ACUTE - 0x0084: 0x0104, # LATIN CAPITAL LETTER A WITH OGONEK - 0x0085: 0x00d6, # LATIN CAPITAL LETTER O WITH DIAERESIS - 0x0086: 0x00dc, # LATIN CAPITAL LETTER U WITH DIAERESIS - 0x0087: 0x00e1, # LATIN SMALL LETTER A WITH ACUTE - 0x0088: 0x0105, # LATIN SMALL LETTER A WITH OGONEK - 0x0089: 0x010c, # LATIN CAPITAL LETTER C WITH CARON - 0x008a: 0x00e4, # LATIN SMALL LETTER A WITH DIAERESIS - 0x008b: 0x010d, # LATIN SMALL LETTER C WITH CARON - 0x008c: 0x0106, # LATIN CAPITAL LETTER C WITH ACUTE - 0x008d: 0x0107, # LATIN SMALL LETTER C WITH ACUTE - 0x008e: 0x00e9, # LATIN SMALL LETTER E WITH ACUTE - 0x008f: 0x0179, # LATIN CAPITAL LETTER Z WITH ACUTE - 0x0090: 0x017a, # LATIN SMALL LETTER Z WITH ACUTE - 0x0091: 0x010e, # LATIN CAPITAL LETTER D WITH CARON - 0x0092: 0x00ed, # LATIN SMALL LETTER I WITH ACUTE - 0x0093: 0x010f, # LATIN SMALL LETTER D WITH CARON - 0x0094: 0x0112, # LATIN CAPITAL LETTER E WITH MACRON - 0x0095: 0x0113, # LATIN SMALL LETTER E WITH MACRON - 0x0096: 0x0116, # LATIN CAPITAL LETTER E WITH DOT ABOVE - 0x0097: 0x00f3, # LATIN SMALL LETTER O WITH ACUTE - 0x0098: 0x0117, # LATIN SMALL LETTER E WITH DOT ABOVE - 0x0099: 0x00f4, # LATIN SMALL LETTER O WITH CIRCUMFLEX - 0x009a: 0x00f6, # LATIN SMALL LETTER O WITH DIAERESIS - 0x009b: 0x00f5, # LATIN SMALL LETTER O WITH TILDE - 0x009c: 0x00fa, # LATIN SMALL LETTER U WITH ACUTE - 0x009d: 0x011a, # LATIN CAPITAL LETTER E WITH CARON - 0x009e: 0x011b, # LATIN SMALL LETTER E WITH CARON - 0x009f: 0x00fc, # LATIN SMALL LETTER U WITH DIAERESIS - 0x00a0: 0x2020, # DAGGER - 0x00a1: 0x00b0, # DEGREE SIGN - 0x00a2: 0x0118, # LATIN CAPITAL LETTER E WITH OGONEK - 0x00a4: 0x00a7, # SECTION SIGN - 0x00a5: 0x2022, # BULLET - 0x00a6: 0x00b6, # PILCROW SIGN - 0x00a7: 0x00df, # LATIN SMALL LETTER SHARP S - 0x00a8: 0x00ae, # REGISTERED SIGN - 0x00aa: 0x2122, # TRADE MARK SIGN - 0x00ab: 0x0119, # LATIN SMALL LETTER E WITH OGONEK - 0x00ac: 0x00a8, # DIAERESIS - 0x00ad: 0x2260, # NOT EQUAL TO - 0x00ae: 0x0123, # LATIN SMALL LETTER G WITH CEDILLA - 0x00af: 0x012e, # LATIN CAPITAL LETTER I WITH OGONEK - 0x00b0: 0x012f, # LATIN SMALL LETTER I WITH OGONEK - 0x00b1: 0x012a, # LATIN CAPITAL LETTER I WITH MACRON - 0x00b2: 0x2264, # LESS-THAN OR EQUAL TO - 0x00b3: 0x2265, # GREATER-THAN OR EQUAL TO - 0x00b4: 0x012b, # LATIN SMALL LETTER I WITH MACRON - 0x00b5: 0x0136, # LATIN CAPITAL LETTER K WITH CEDILLA - 0x00b6: 0x2202, # PARTIAL DIFFERENTIAL - 0x00b7: 0x2211, # N-ARY SUMMATION - 0x00b8: 0x0142, # LATIN SMALL LETTER L WITH STROKE - 0x00b9: 0x013b, # LATIN CAPITAL LETTER L WITH CEDILLA - 0x00ba: 0x013c, # LATIN SMALL LETTER L WITH CEDILLA - 0x00bb: 0x013d, # LATIN CAPITAL LETTER L WITH CARON - 0x00bc: 0x013e, # LATIN SMALL LETTER L WITH CARON - 0x00bd: 0x0139, # LATIN CAPITAL LETTER L WITH ACUTE - 0x00be: 0x013a, # LATIN SMALL LETTER L WITH ACUTE - 0x00bf: 0x0145, # LATIN CAPITAL LETTER N WITH CEDILLA - 0x00c0: 0x0146, # LATIN SMALL LETTER N WITH CEDILLA - 0x00c1: 0x0143, # LATIN CAPITAL LETTER N WITH ACUTE - 0x00c2: 0x00ac, # NOT SIGN - 0x00c3: 0x221a, # SQUARE ROOT - 0x00c4: 0x0144, # LATIN SMALL LETTER N WITH ACUTE - 0x00c5: 0x0147, # LATIN CAPITAL LETTER N WITH CARON - 0x00c6: 0x2206, # INCREMENT - 0x00c7: 0x00ab, # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK - 0x00c8: 0x00bb, # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK - 0x00c9: 0x2026, # HORIZONTAL ELLIPSIS - 0x00ca: 0x00a0, # NO-BREAK SPACE - 0x00cb: 0x0148, # LATIN SMALL LETTER N WITH CARON - 0x00cc: 0x0150, # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE - 0x00cd: 0x00d5, # LATIN CAPITAL LETTER O WITH TILDE - 0x00ce: 0x0151, # LATIN SMALL LETTER O WITH DOUBLE ACUTE - 0x00cf: 0x014c, # LATIN CAPITAL LETTER O WITH MACRON - 0x00d0: 0x2013, # EN DASH - 0x00d1: 0x2014, # EM DASH - 0x00d2: 0x201c, # LEFT DOUBLE QUOTATION MARK - 0x00d3: 0x201d, # RIGHT DOUBLE QUOTATION MARK - 0x00d4: 0x2018, # LEFT SINGLE QUOTATION MARK - 0x00d5: 0x2019, # RIGHT SINGLE QUOTATION MARK - 0x00d6: 0x00f7, # DIVISION SIGN - 0x00d7: 0x25ca, # LOZENGE - 0x00d8: 0x014d, # LATIN SMALL LETTER O WITH MACRON - 0x00d9: 0x0154, # LATIN CAPITAL LETTER R WITH ACUTE - 0x00da: 0x0155, # LATIN SMALL LETTER R WITH ACUTE - 0x00db: 0x0158, # LATIN CAPITAL LETTER R WITH CARON - 0x00dc: 0x2039, # SINGLE LEFT-POINTING ANGLE QUOTATION MARK - 0x00dd: 0x203a, # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK - 0x00de: 0x0159, # LATIN SMALL LETTER R WITH CARON - 0x00df: 0x0156, # LATIN CAPITAL LETTER R WITH CEDILLA - 0x00e0: 0x0157, # LATIN SMALL LETTER R WITH CEDILLA - 0x00e1: 0x0160, # LATIN CAPITAL LETTER S WITH CARON - 0x00e2: 0x201a, # SINGLE LOW-9 QUOTATION MARK - 0x00e3: 0x201e, # DOUBLE LOW-9 QUOTATION MARK - 0x00e4: 0x0161, # LATIN SMALL LETTER S WITH CARON - 0x00e5: 0x015a, # LATIN CAPITAL LETTER S WITH ACUTE - 0x00e6: 0x015b, # LATIN SMALL LETTER S WITH ACUTE - 0x00e7: 0x00c1, # LATIN CAPITAL LETTER A WITH ACUTE - 0x00e8: 0x0164, # LATIN CAPITAL LETTER T WITH CARON - 0x00e9: 0x0165, # LATIN SMALL LETTER T WITH CARON - 0x00ea: 0x00cd, # LATIN CAPITAL LETTER I WITH ACUTE - 0x00eb: 0x017d, # LATIN CAPITAL LETTER Z WITH CARON - 0x00ec: 0x017e, # LATIN SMALL LETTER Z WITH CARON - 0x00ed: 0x016a, # LATIN CAPITAL LETTER U WITH MACRON - 0x00ee: 0x00d3, # LATIN CAPITAL LETTER O WITH ACUTE - 0x00ef: 0x00d4, # LATIN CAPITAL LETTER O WITH CIRCUMFLEX - 0x00f0: 0x016b, # LATIN SMALL LETTER U WITH MACRON - 0x00f1: 0x016e, # LATIN CAPITAL LETTER U WITH RING ABOVE - 0x00f2: 0x00da, # LATIN CAPITAL LETTER U WITH ACUTE - 0x00f3: 0x016f, # LATIN SMALL LETTER U WITH RING ABOVE - 0x00f4: 0x0170, # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE - 0x00f5: 0x0171, # LATIN SMALL LETTER U WITH DOUBLE ACUTE - 0x00f6: 0x0172, # LATIN CAPITAL LETTER U WITH OGONEK - 0x00f7: 0x0173, # LATIN SMALL LETTER U WITH OGONEK - 0x00f8: 0x00dd, # LATIN CAPITAL LETTER Y WITH ACUTE - 0x00f9: 0x00fd, # LATIN SMALL LETTER Y WITH ACUTE - 0x00fa: 0x0137, # LATIN SMALL LETTER K WITH CEDILLA - 0x00fb: 0x017b, # LATIN CAPITAL LETTER Z WITH DOT ABOVE - 0x00fc: 0x0141, # LATIN CAPITAL LETTER L WITH STROKE - 0x00fd: 0x017c, # LATIN SMALL LETTER Z WITH DOT ABOVE - 0x00fe: 0x0122, # LATIN CAPITAL LETTER G WITH CEDILLA - 0x00ff: 0x02c7, # CARON -}) - -### Encoding Map - -encoding_map = codecs.make_encoding_map(decoding_map) + +### Decoding Table + +decoding_table = ( + '\x00' # 0x00 -> NULL + '\x01' # 0x01 -> START OF HEADING + '\x02' # 0x02 -> START OF TEXT + '\x03' # 0x03 -> END OF TEXT + '\x04' # 0x04 -> END OF TRANSMISSION + '\x05' # 0x05 -> ENQUIRY + '\x06' # 0x06 -> ACKNOWLEDGE + '\x07' # 0x07 -> BELL + '\x08' # 0x08 -> BACKSPACE + '\t' # 0x09 -> HORIZONTAL TABULATION + '\n' # 0x0A -> LINE FEED + '\x0b' # 0x0B -> VERTICAL TABULATION + '\x0c' # 0x0C -> FORM FEED + '\r' # 0x0D -> CARRIAGE RETURN + '\x0e' # 0x0E -> SHIFT OUT + '\x0f' # 0x0F -> SHIFT IN + '\x10' # 0x10 -> DATA LINK ESCAPE + '\x11' # 0x11 -> DEVICE CONTROL ONE + '\x12' # 0x12 -> DEVICE CONTROL TWO + '\x13' # 0x13 -> DEVICE CONTROL THREE + '\x14' # 0x14 -> DEVICE CONTROL FOUR + '\x15' # 0x15 -> NEGATIVE ACKNOWLEDGE + '\x16' # 0x16 -> SYNCHRONOUS IDLE + '\x17' # 0x17 -> END OF TRANSMISSION BLOCK + '\x18' # 0x18 -> CANCEL + '\x19' # 0x19 -> END OF MEDIUM + '\x1a' # 0x1A -> SUBSTITUTE + '\x1b' # 0x1B -> ESCAPE + '\x1c' # 0x1C -> FILE SEPARATOR + '\x1d' # 0x1D -> GROUP SEPARATOR + '\x1e' # 0x1E -> RECORD SEPARATOR + '\x1f' # 0x1F -> UNIT SEPARATOR + ' ' # 0x20 -> SPACE + '!' # 0x21 -> EXCLAMATION MARK + '"' # 0x22 -> QUOTATION MARK + '#' # 0x23 -> NUMBER SIGN + '$' # 0x24 -> DOLLAR SIGN + '%' # 0x25 -> PERCENT SIGN + '&' # 0x26 -> AMPERSAND + "'" # 0x27 -> APOSTROPHE + '(' # 0x28 -> LEFT PARENTHESIS + ')' # 0x29 -> RIGHT PARENTHESIS + '*' # 0x2A -> ASTERISK + '+' # 0x2B -> PLUS SIGN + ',' # 0x2C -> COMMA + '-' # 0x2D -> HYPHEN-MINUS + '.' # 0x2E -> FULL STOP + '/' # 0x2F -> SOLIDUS + '0' # 0x30 -> DIGIT ZERO + '1' # 0x31 -> DIGIT ONE + '2' # 0x32 -> DIGIT TWO + '3' # 0x33 -> DIGIT THREE + '4' # 0x34 -> DIGIT FOUR + '5' # 0x35 -> DIGIT FIVE + '6' # 0x36 -> DIGIT SIX + '7' # 0x37 -> DIGIT SEVEN + '8' # 0x38 -> DIGIT EIGHT + '9' # 0x39 -> DIGIT NINE + ':' # 0x3A -> COLON + ';' # 0x3B -> SEMICOLON + '<' # 0x3C -> LESS-THAN SIGN + '=' # 0x3D -> EQUALS SIGN + '>' # 0x3E -> GREATER-THAN SIGN + '?' # 0x3F -> QUESTION MARK + '@' # 0x40 -> COMMERCIAL AT + 'A' # 0x41 -> LATIN CAPITAL LETTER A + 'B' # 0x42 -> LATIN CAPITAL LETTER B + 'C' # 0x43 -> LATIN CAPITAL LETTER C + 'D' # 0x44 -> LATIN CAPITAL LETTER D + 'E' # 0x45 -> LATIN CAPITAL LETTER E + 'F' # 0x46 -> LATIN CAPITAL LETTER F + 'G' # 0x47 -> LATIN CAPITAL LETTER G + 'H' # 0x48 -> LATIN CAPITAL LETTER H + 'I' # 0x49 -> LATIN CAPITAL LETTER I + 'J' # 0x4A -> LATIN CAPITAL LETTER J + 'K' # 0x4B -> LATIN CAPITAL LETTER K + 'L' # 0x4C -> LATIN CAPITAL LETTER L + 'M' # 0x4D -> LATIN CAPITAL LETTER M + 'N' # 0x4E -> LATIN CAPITAL LETTER N + 'O' # 0x4F -> LATIN CAPITAL LETTER O + 'P' # 0x50 -> LATIN CAPITAL LETTER P + 'Q' # 0x51 -> LATIN CAPITAL LETTER Q + 'R' # 0x52 -> LATIN CAPITAL LETTER R + 'S' # 0x53 -> LATIN CAPITAL LETTER S + 'T' # 0x54 -> LATIN CAPITAL LETTER T + 'U' # 0x55 -> LATIN CAPITAL LETTER U + 'V' # 0x56 -> LATIN CAPITAL LETTER V + 'W' # 0x57 -> LATIN CAPITAL LETTER W + 'X' # 0x58 -> LATIN CAPITAL LETTER X + 'Y' # 0x59 -> LATIN CAPITAL LETTER Y + 'Z' # 0x5A -> LATIN CAPITAL LETTER Z + '[' # 0x5B -> LEFT SQUARE BRACKET + '\\' # 0x5C -> REVERSE SOLIDUS + ']' # 0x5D -> RIGHT SQUARE BRACKET + '^' # 0x5E -> CIRCUMFLEX ACCENT + '_' # 0x5F -> LOW LINE + '`' # 0x60 -> GRAVE ACCENT + 'a' # 0x61 -> LATIN SMALL LETTER A + 'b' # 0x62 -> LATIN SMALL LETTER B + 'c' # 0x63 -> LATIN SMALL LETTER C + 'd' # 0x64 -> LATIN SMALL LETTER D + 'e' # 0x65 -> LATIN SMALL LETTER E + 'f' # 0x66 -> LATIN SMALL LETTER F + 'g' # 0x67 -> LATIN SMALL LETTER G + 'h' # 0x68 -> LATIN SMALL LETTER H + 'i' # 0x69 -> LATIN SMALL LETTER I + 'j' # 0x6A -> LATIN SMALL LETTER J + 'k' # 0x6B -> LATIN SMALL LETTER K + 'l' # 0x6C -> LATIN SMALL LETTER L + 'm' # 0x6D -> LATIN SMALL LETTER M + 'n' # 0x6E -> LATIN SMALL LETTER N + 'o' # 0x6F -> LATIN SMALL LETTER O + 'p' # 0x70 -> LATIN SMALL LETTER P + 'q' # 0x71 -> LATIN SMALL LETTER Q + 'r' # 0x72 -> LATIN SMALL LETTER R + 's' # 0x73 -> LATIN SMALL LETTER S + 't' # 0x74 -> LATIN SMALL LETTER T + 'u' # 0x75 -> LATIN SMALL LETTER U + 'v' # 0x76 -> LATIN SMALL LETTER V + 'w' # 0x77 -> LATIN SMALL LETTER W + 'x' # 0x78 -> LATIN SMALL LETTER X + 'y' # 0x79 -> LATIN SMALL LETTER Y + 'z' # 0x7A -> LATIN SMALL LETTER Z + '{' # 0x7B -> LEFT CURLY BRACKET + '|' # 0x7C -> VERTICAL LINE + '}' # 0x7D -> RIGHT CURLY BRACKET + '~' # 0x7E -> TILDE + '\x7f' # 0x7F -> DELETE + '\xc4' # 0x80 -> LATIN CAPITAL LETTER A WITH DIAERESIS + '\u0100' # 0x81 -> LATIN CAPITAL LETTER A WITH MACRON + '\u0101' # 0x82 -> LATIN SMALL LETTER A WITH MACRON + '\xc9' # 0x83 -> LATIN CAPITAL LETTER E WITH ACUTE + '\u0104' # 0x84 -> LATIN CAPITAL LETTER A WITH OGONEK + '\xd6' # 0x85 -> LATIN CAPITAL LETTER O WITH DIAERESIS + '\xdc' # 0x86 -> LATIN CAPITAL LETTER U WITH DIAERESIS + '\xe1' # 0x87 -> LATIN SMALL LETTER A WITH ACUTE + '\u0105' # 0x88 -> LATIN SMALL LETTER A WITH OGONEK + '\u010c' # 0x89 -> LATIN CAPITAL LETTER C WITH CARON + '\xe4' # 0x8A -> LATIN SMALL LETTER A WITH DIAERESIS + '\u010d' # 0x8B -> LATIN SMALL LETTER C WITH CARON + '\u0106' # 0x8C -> LATIN CAPITAL LETTER C WITH ACUTE + '\u0107' # 0x8D -> LATIN SMALL LETTER C WITH ACUTE + '\xe9' # 0x8E -> LATIN SMALL LETTER E WITH ACUTE + '\u0179' # 0x8F -> LATIN CAPITAL LETTER Z WITH ACUTE + '\u017a' # 0x90 -> LATIN SMALL LETTER Z WITH ACUTE + '\u010e' # 0x91 -> LATIN CAPITAL LETTER D WITH CARON + '\xed' # 0x92 -> LATIN SMALL LETTER I WITH ACUTE + '\u010f' # 0x93 -> LATIN SMALL LETTER D WITH CARON + '\u0112' # 0x94 -> LATIN CAPITAL LETTER E WITH MACRON + '\u0113' # 0x95 -> LATIN SMALL LETTER E WITH MACRON + '\u0116' # 0x96 -> LATIN CAPITAL LETTER E WITH DOT ABOVE + '\xf3' # 0x97 -> LATIN SMALL LETTER O WITH ACUTE + '\u0117' # 0x98 -> LATIN SMALL LETTER E WITH DOT ABOVE + '\xf4' # 0x99 -> LATIN SMALL LETTER O WITH CIRCUMFLEX + '\xf6' # 0x9A -> LATIN SMALL LETTER O WITH DIAERESIS + '\xf5' # 0x9B -> LATIN SMALL LETTER O WITH TILDE + '\xfa' # 0x9C -> LATIN SMALL LETTER U WITH ACUTE + '\u011a' # 0x9D -> LATIN CAPITAL LETTER E WITH CARON + '\u011b' # 0x9E -> LATIN SMALL LETTER E WITH CARON + '\xfc' # 0x9F -> LATIN SMALL LETTER U WITH DIAERESIS + '\u2020' # 0xA0 -> DAGGER + '\xb0' # 0xA1 -> DEGREE SIGN + '\u0118' # 0xA2 -> LATIN CAPITAL LETTER E WITH OGONEK + '\xa3' # 0xA3 -> POUND SIGN + '\xa7' # 0xA4 -> SECTION SIGN + '\u2022' # 0xA5 -> BULLET + '\xb6' # 0xA6 -> PILCROW SIGN + '\xdf' # 0xA7 -> LATIN SMALL LETTER SHARP S + '\xae' # 0xA8 -> REGISTERED SIGN + '\xa9' # 0xA9 -> COPYRIGHT SIGN + '\u2122' # 0xAA -> TRADE MARK SIGN + '\u0119' # 0xAB -> LATIN SMALL LETTER E WITH OGONEK + '\xa8' # 0xAC -> DIAERESIS + '\u2260' # 0xAD -> NOT EQUAL TO + '\u0123' # 0xAE -> LATIN SMALL LETTER G WITH CEDILLA + '\u012e' # 0xAF -> LATIN CAPITAL LETTER I WITH OGONEK + '\u012f' # 0xB0 -> LATIN SMALL LETTER I WITH OGONEK + '\u012a' # 0xB1 -> LATIN CAPITAL LETTER I WITH MACRON + '\u2264' # 0xB2 -> LESS-THAN OR EQUAL TO + '\u2265' # 0xB3 -> GREATER-THAN OR EQUAL TO + '\u012b' # 0xB4 -> LATIN SMALL LETTER I WITH MACRON + '\u0136' # 0xB5 -> LATIN CAPITAL LETTER K WITH CEDILLA + '\u2202' # 0xB6 -> PARTIAL DIFFERENTIAL + '\u2211' # 0xB7 -> N-ARY SUMMATION + '\u0142' # 0xB8 -> LATIN SMALL LETTER L WITH STROKE + '\u013b' # 0xB9 -> LATIN CAPITAL LETTER L WITH CEDILLA + '\u013c' # 0xBA -> LATIN SMALL LETTER L WITH CEDILLA + '\u013d' # 0xBB -> LATIN CAPITAL LETTER L WITH CARON + '\u013e' # 0xBC -> LATIN SMALL LETTER L WITH CARON + '\u0139' # 0xBD -> LATIN CAPITAL LETTER L WITH ACUTE + '\u013a' # 0xBE -> LATIN SMALL LETTER L WITH ACUTE + '\u0145' # 0xBF -> LATIN CAPITAL LETTER N WITH CEDILLA + '\u0146' # 0xC0 -> LATIN SMALL LETTER N WITH CEDILLA + '\u0143' # 0xC1 -> LATIN CAPITAL LETTER N WITH ACUTE + '\xac' # 0xC2 -> NOT SIGN + '\u221a' # 0xC3 -> SQUARE ROOT + '\u0144' # 0xC4 -> LATIN SMALL LETTER N WITH ACUTE + '\u0147' # 0xC5 -> LATIN CAPITAL LETTER N WITH CARON + '\u2206' # 0xC6 -> INCREMENT + '\xab' # 0xC7 -> LEFT-POINTING DOUBLE ANGLE QUOTATION MARK + '\xbb' # 0xC8 -> RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + '\u2026' # 0xC9 -> HORIZONTAL ELLIPSIS + '\xa0' # 0xCA -> NO-BREAK SPACE + '\u0148' # 0xCB -> LATIN SMALL LETTER N WITH CARON + '\u0150' # 0xCC -> LATIN CAPITAL LETTER O WITH DOUBLE ACUTE + '\xd5' # 0xCD -> LATIN CAPITAL LETTER O WITH TILDE + '\u0151' # 0xCE -> LATIN SMALL LETTER O WITH DOUBLE ACUTE + '\u014c' # 0xCF -> LATIN CAPITAL LETTER O WITH MACRON + '\u2013' # 0xD0 -> EN DASH + '\u2014' # 0xD1 -> EM DASH + '\u201c' # 0xD2 -> LEFT DOUBLE QUOTATION MARK + '\u201d' # 0xD3 -> RIGHT DOUBLE QUOTATION MARK + '\u2018' # 0xD4 -> LEFT SINGLE QUOTATION MARK + '\u2019' # 0xD5 -> RIGHT SINGLE QUOTATION MARK + '\xf7' # 0xD6 -> DIVISION SIGN + '\u25ca' # 0xD7 -> LOZENGE + '\u014d' # 0xD8 -> LATIN SMALL LETTER O WITH MACRON + '\u0154' # 0xD9 -> LATIN CAPITAL LETTER R WITH ACUTE + '\u0155' # 0xDA -> LATIN SMALL LETTER R WITH ACUTE + '\u0158' # 0xDB -> LATIN CAPITAL LETTER R WITH CARON + '\u2039' # 0xDC -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK + '\u203a' # 0xDD -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + '\u0159' # 0xDE -> LATIN SMALL LETTER R WITH CARON + '\u0156' # 0xDF -> LATIN CAPITAL LETTER R WITH CEDILLA + '\u0157' # 0xE0 -> LATIN SMALL LETTER R WITH CEDILLA + '\u0160' # 0xE1 -> LATIN CAPITAL LETTER S WITH CARON + '\u201a' # 0xE2 -> SINGLE LOW-9 QUOTATION MARK + '\u201e' # 0xE3 -> DOUBLE LOW-9 QUOTATION MARK + '\u0161' # 0xE4 -> LATIN SMALL LETTER S WITH CARON + '\u015a' # 0xE5 -> LATIN CAPITAL LETTER S WITH ACUTE + '\u015b' # 0xE6 -> LATIN SMALL LETTER S WITH ACUTE + '\xc1' # 0xE7 -> LATIN CAPITAL LETTER A WITH ACUTE + '\u0164' # 0xE8 -> LATIN CAPITAL LETTER T WITH CARON + '\u0165' # 0xE9 -> LATIN SMALL LETTER T WITH CARON + '\xcd' # 0xEA -> LATIN CAPITAL LETTER I WITH ACUTE + '\u017d' # 0xEB -> LATIN CAPITAL LETTER Z WITH CARON + '\u017e' # 0xEC -> LATIN SMALL LETTER Z WITH CARON + '\u016a' # 0xED -> LATIN CAPITAL LETTER U WITH MACRON + '\xd3' # 0xEE -> LATIN CAPITAL LETTER O WITH ACUTE + '\xd4' # 0xEF -> LATIN CAPITAL LETTER O WITH CIRCUMFLEX + '\u016b' # 0xF0 -> LATIN SMALL LETTER U WITH MACRON + '\u016e' # 0xF1 -> LATIN CAPITAL LETTER U WITH RING ABOVE + '\xda' # 0xF2 -> LATIN CAPITAL LETTER U WITH ACUTE + '\u016f' # 0xF3 -> LATIN SMALL LETTER U WITH RING ABOVE + '\u0170' # 0xF4 -> LATIN CAPITAL LETTER U WITH DOUBLE ACUTE + '\u0171' # 0xF5 -> LATIN SMALL LETTER U WITH DOUBLE ACUTE + '\u0172' # 0xF6 -> LATIN CAPITAL LETTER U WITH OGONEK + '\u0173' # 0xF7 -> LATIN SMALL LETTER U WITH OGONEK + '\xdd' # 0xF8 -> LATIN CAPITAL LETTER Y WITH ACUTE + '\xfd' # 0xF9 -> LATIN SMALL LETTER Y WITH ACUTE + '\u0137' # 0xFA -> LATIN SMALL LETTER K WITH CEDILLA + '\u017b' # 0xFB -> LATIN CAPITAL LETTER Z WITH DOT ABOVE + '\u0141' # 0xFC -> LATIN CAPITAL LETTER L WITH STROKE + '\u017c' # 0xFD -> LATIN SMALL LETTER Z WITH DOT ABOVE + '\u0122' # 0xFE -> LATIN CAPITAL LETTER G WITH CEDILLA + '\u02c7' # 0xFF -> CARON +) + +### Encoding table +encoding_table=codecs.charmap_build(decoding_table) diff --git a/Lib/encodings/palmos.py b/Lib/encodings/palmos.py index 4b77e2b..ae19752 100644 --- a/Lib/encodings/palmos.py +++ b/Lib/encodings/palmos.py @@ -10,18 +10,18 @@ import codecs class Codec(codecs.Codec): def encode(self,input,errors='strict'): - return codecs.charmap_encode(input,errors,encoding_map) + return codecs.charmap_encode(input,errors,encoding_table) def decode(self,input,errors='strict'): - return codecs.charmap_decode(input,errors,decoding_map) + return codecs.charmap_decode(input,errors,decoding_table) class IncrementalEncoder(codecs.IncrementalEncoder): def encode(self, input, final=False): - return codecs.charmap_encode(input,self.errors,encoding_map)[0] + return codecs.charmap_encode(input,self.errors,encoding_table)[0] class IncrementalDecoder(codecs.IncrementalDecoder): def decode(self, input, final=False): - return codecs.charmap_decode(input,self.errors,decoding_map)[0] + return codecs.charmap_decode(input,self.errors,decoding_table)[0] class StreamWriter(Codec,codecs.StreamWriter): pass @@ -42,42 +42,268 @@ def getregentry(): streamwriter=StreamWriter, ) -### Decoding Map - -decoding_map = codecs.make_identity_dict(range(256)) - -# The PalmOS character set is mostly iso-8859-1 with some differences. -decoding_map.update({ - 0x0080: 0x20ac, # EURO SIGN - 0x0082: 0x201a, # SINGLE LOW-9 QUOTATION MARK - 0x0083: 0x0192, # LATIN SMALL LETTER F WITH HOOK - 0x0084: 0x201e, # DOUBLE LOW-9 QUOTATION MARK - 0x0085: 0x2026, # HORIZONTAL ELLIPSIS - 0x0086: 0x2020, # DAGGER - 0x0087: 0x2021, # DOUBLE DAGGER - 0x0088: 0x02c6, # MODIFIER LETTER CIRCUMFLEX ACCENT - 0x0089: 0x2030, # PER MILLE SIGN - 0x008a: 0x0160, # LATIN CAPITAL LETTER S WITH CARON - 0x008b: 0x2039, # SINGLE LEFT-POINTING ANGLE QUOTATION MARK - 0x008c: 0x0152, # LATIN CAPITAL LIGATURE OE - 0x008d: 0x2666, # BLACK DIAMOND SUIT - 0x008e: 0x2663, # BLACK CLUB SUIT - 0x008f: 0x2665, # BLACK HEART SUIT - 0x0090: 0x2660, # BLACK SPADE SUIT - 0x0091: 0x2018, # LEFT SINGLE QUOTATION MARK - 0x0092: 0x2019, # RIGHT SINGLE QUOTATION MARK - 0x0093: 0x201c, # LEFT DOUBLE QUOTATION MARK - 0x0094: 0x201d, # RIGHT DOUBLE QUOTATION MARK - 0x0095: 0x2022, # BULLET - 0x0096: 0x2013, # EN DASH - 0x0097: 0x2014, # EM DASH - 0x0098: 0x02dc, # SMALL TILDE - 0x0099: 0x2122, # TRADE MARK SIGN - 0x009a: 0x0161, # LATIN SMALL LETTER S WITH CARON - 0x009c: 0x0153, # LATIN SMALL LIGATURE OE - 0x009f: 0x0178, # LATIN CAPITAL LETTER Y WITH DIAERESIS -}) - -### Encoding Map - -encoding_map = codecs.make_encoding_map(decoding_map) + +### Decoding Table + +decoding_table = ( + '\x00' # 0x00 -> NULL + '\x01' # 0x01 -> START OF HEADING + '\x02' # 0x02 -> START OF TEXT + '\x03' # 0x03 -> END OF TEXT + '\x04' # 0x04 -> END OF TRANSMISSION + '\x05' # 0x05 -> ENQUIRY + '\x06' # 0x06 -> ACKNOWLEDGE + '\x07' # 0x07 -> BELL + '\x08' # 0x08 -> BACKSPACE + '\t' # 0x09 -> HORIZONTAL TABULATION + '\n' # 0x0A -> LINE FEED + '\x0b' # 0x0B -> VERTICAL TABULATION + '\x0c' # 0x0C -> FORM FEED + '\r' # 0x0D -> CARRIAGE RETURN + '\x0e' # 0x0E -> SHIFT OUT + '\x0f' # 0x0F -> SHIFT IN + '\x10' # 0x10 -> DATA LINK ESCAPE + '\x11' # 0x11 -> DEVICE CONTROL ONE + '\x12' # 0x12 -> DEVICE CONTROL TWO + '\x13' # 0x13 -> DEVICE CONTROL THREE + '\x14' # 0x14 -> DEVICE CONTROL FOUR + '\x15' # 0x15 -> NEGATIVE ACKNOWLEDGE + '\x16' # 0x16 -> SYNCHRONOUS IDLE + '\x17' # 0x17 -> END OF TRANSMISSION BLOCK + '\x18' # 0x18 -> CANCEL + '\x19' # 0x19 -> END OF MEDIUM + '\x1a' # 0x1A -> SUBSTITUTE + '\x1b' # 0x1B -> ESCAPE + '\x1c' # 0x1C -> FILE SEPARATOR + '\x1d' # 0x1D -> GROUP SEPARATOR + '\x1e' # 0x1E -> RECORD SEPARATOR + '\x1f' # 0x1F -> UNIT SEPARATOR + ' ' # 0x20 -> SPACE + '!' # 0x21 -> EXCLAMATION MARK + '"' # 0x22 -> QUOTATION MARK + '#' # 0x23 -> NUMBER SIGN + '$' # 0x24 -> DOLLAR SIGN + '%' # 0x25 -> PERCENT SIGN + '&' # 0x26 -> AMPERSAND + "'" # 0x27 -> APOSTROPHE + '(' # 0x28 -> LEFT PARENTHESIS + ')' # 0x29 -> RIGHT PARENTHESIS + '*' # 0x2A -> ASTERISK + '+' # 0x2B -> PLUS SIGN + ',' # 0x2C -> COMMA + '-' # 0x2D -> HYPHEN-MINUS + '.' # 0x2E -> FULL STOP + '/' # 0x2F -> SOLIDUS + '0' # 0x30 -> DIGIT ZERO + '1' # 0x31 -> DIGIT ONE + '2' # 0x32 -> DIGIT TWO + '3' # 0x33 -> DIGIT THREE + '4' # 0x34 -> DIGIT FOUR + '5' # 0x35 -> DIGIT FIVE + '6' # 0x36 -> DIGIT SIX + '7' # 0x37 -> DIGIT SEVEN + '8' # 0x38 -> DIGIT EIGHT + '9' # 0x39 -> DIGIT NINE + ':' # 0x3A -> COLON + ';' # 0x3B -> SEMICOLON + '<' # 0x3C -> LESS-THAN SIGN + '=' # 0x3D -> EQUALS SIGN + '>' # 0x3E -> GREATER-THAN SIGN + '?' # 0x3F -> QUESTION MARK + '@' # 0x40 -> COMMERCIAL AT + 'A' # 0x41 -> LATIN CAPITAL LETTER A + 'B' # 0x42 -> LATIN CAPITAL LETTER B + 'C' # 0x43 -> LATIN CAPITAL LETTER C + 'D' # 0x44 -> LATIN CAPITAL LETTER D + 'E' # 0x45 -> LATIN CAPITAL LETTER E + 'F' # 0x46 -> LATIN CAPITAL LETTER F + 'G' # 0x47 -> LATIN CAPITAL LETTER G + 'H' # 0x48 -> LATIN CAPITAL LETTER H + 'I' # 0x49 -> LATIN CAPITAL LETTER I + 'J' # 0x4A -> LATIN CAPITAL LETTER J + 'K' # 0x4B -> LATIN CAPITAL LETTER K + 'L' # 0x4C -> LATIN CAPITAL LETTER L + 'M' # 0x4D -> LATIN CAPITAL LETTER M + 'N' # 0x4E -> LATIN CAPITAL LETTER N + 'O' # 0x4F -> LATIN CAPITAL LETTER O + 'P' # 0x50 -> LATIN CAPITAL LETTER P + 'Q' # 0x51 -> LATIN CAPITAL LETTER Q + 'R' # 0x52 -> LATIN CAPITAL LETTER R + 'S' # 0x53 -> LATIN CAPITAL LETTER S + 'T' # 0x54 -> LATIN CAPITAL LETTER T + 'U' # 0x55 -> LATIN CAPITAL LETTER U + 'V' # 0x56 -> LATIN CAPITAL LETTER V + 'W' # 0x57 -> LATIN CAPITAL LETTER W + 'X' # 0x58 -> LATIN CAPITAL LETTER X + 'Y' # 0x59 -> LATIN CAPITAL LETTER Y + 'Z' # 0x5A -> LATIN CAPITAL LETTER Z + '[' # 0x5B -> LEFT SQUARE BRACKET + '\\' # 0x5C -> REVERSE SOLIDUS + ']' # 0x5D -> RIGHT SQUARE BRACKET + '^' # 0x5E -> CIRCUMFLEX ACCENT + '_' # 0x5F -> LOW LINE + '`' # 0x60 -> GRAVE ACCENT + 'a' # 0x61 -> LATIN SMALL LETTER A + 'b' # 0x62 -> LATIN SMALL LETTER B + 'c' # 0x63 -> LATIN SMALL LETTER C + 'd' # 0x64 -> LATIN SMALL LETTER D + 'e' # 0x65 -> LATIN SMALL LETTER E + 'f' # 0x66 -> LATIN SMALL LETTER F + 'g' # 0x67 -> LATIN SMALL LETTER G + 'h' # 0x68 -> LATIN SMALL LETTER H + 'i' # 0x69 -> LATIN SMALL LETTER I + 'j' # 0x6A -> LATIN SMALL LETTER J + 'k' # 0x6B -> LATIN SMALL LETTER K + 'l' # 0x6C -> LATIN SMALL LETTER L + 'm' # 0x6D -> LATIN SMALL LETTER M + 'n' # 0x6E -> LATIN SMALL LETTER N + 'o' # 0x6F -> LATIN SMALL LETTER O + 'p' # 0x70 -> LATIN SMALL LETTER P + 'q' # 0x71 -> LATIN SMALL LETTER Q + 'r' # 0x72 -> LATIN SMALL LETTER R + 's' # 0x73 -> LATIN SMALL LETTER S + 't' # 0x74 -> LATIN SMALL LETTER T + 'u' # 0x75 -> LATIN SMALL LETTER U + 'v' # 0x76 -> LATIN SMALL LETTER V + 'w' # 0x77 -> LATIN SMALL LETTER W + 'x' # 0x78 -> LATIN SMALL LETTER X + 'y' # 0x79 -> LATIN SMALL LETTER Y + 'z' # 0x7A -> LATIN SMALL LETTER Z + '{' # 0x7B -> LEFT CURLY BRACKET + '|' # 0x7C -> VERTICAL LINE + '}' # 0x7D -> RIGHT CURLY BRACKET + '~' # 0x7E -> TILDE + '\x7f' # 0x7F -> DELETE + '\u20ac' # 0x80 -> EURO SIGN + '\x81' # 0x81 -> + '\u201a' # 0x82 -> SINGLE LOW-9 QUOTATION MARK + '\u0192' # 0x83 -> LATIN SMALL LETTER F WITH HOOK + '\u201e' # 0x84 -> DOUBLE LOW-9 QUOTATION MARK + '\u2026' # 0x85 -> HORIZONTAL ELLIPSIS + '\u2020' # 0x86 -> DAGGER + '\u2021' # 0x87 -> DOUBLE DAGGER + '\u02c6' # 0x88 -> MODIFIER LETTER CIRCUMFLEX ACCENT + '\u2030' # 0x89 -> PER MILLE SIGN + '\u0160' # 0x8A -> LATIN CAPITAL LETTER S WITH CARON + '\u2039' # 0x8B -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK + '\u0152' # 0x8C -> LATIN CAPITAL LIGATURE OE + '\u2666' # 0x8D -> BLACK DIAMOND SUIT + '\u2663' # 0x8E -> BLACK CLUB SUIT + '\u2665' # 0x8F -> BLACK HEART SUIT + '\u2660' # 0x90 -> BLACK SPADE SUIT + '\u2018' # 0x91 -> LEFT SINGLE QUOTATION MARK + '\u2019' # 0x92 -> RIGHT SINGLE QUOTATION MARK + '\u201c' # 0x93 -> LEFT DOUBLE QUOTATION MARK + '\u201d' # 0x94 -> RIGHT DOUBLE QUOTATION MARK + '\u2022' # 0x95 -> BULLET + '\u2013' # 0x96 -> EN DASH + '\u2014' # 0x97 -> EM DASH + '\u02dc' # 0x98 -> SMALL TILDE + '\u2122' # 0x99 -> TRADE MARK SIGN + '\u0161' # 0x9A -> LATIN SMALL LETTER S WITH CARON + '\x9b' # 0x9B -> + '\u0153' # 0x9C -> LATIN SMALL LIGATURE OE + '\x9d' # 0x9D -> + '\x9e' # 0x9E -> + '\u0178' # 0x9F -> LATIN CAPITAL LETTER Y WITH DIAERESIS + '\xa0' # 0xA0 -> NO-BREAK SPACE + '\xa1' # 0xA1 -> INVERTED EXCLAMATION MARK + '\xa2' # 0xA2 -> CENT SIGN + '\xa3' # 0xA3 -> POUND SIGN + '\xa4' # 0xA4 -> CURRENCY SIGN + '\xa5' # 0xA5 -> YEN SIGN + '\xa6' # 0xA6 -> BROKEN BAR + '\xa7' # 0xA7 -> SECTION SIGN + '\xa8' # 0xA8 -> DIAERESIS + '\xa9' # 0xA9 -> COPYRIGHT SIGN + '\xaa' # 0xAA -> FEMININE ORDINAL INDICATOR + '\xab' # 0xAB -> LEFT-POINTING DOUBLE ANGLE QUOTATION MARK + '\xac' # 0xAC -> NOT SIGN + '\xad' # 0xAD -> SOFT HYPHEN + '\xae' # 0xAE -> REGISTERED SIGN + '\xaf' # 0xAF -> MACRON + '\xb0' # 0xB0 -> DEGREE SIGN + '\xb1' # 0xB1 -> PLUS-MINUS SIGN + '\xb2' # 0xB2 -> SUPERSCRIPT TWO + '\xb3' # 0xB3 -> SUPERSCRIPT THREE + '\xb4' # 0xB4 -> ACUTE ACCENT + '\xb5' # 0xB5 -> MICRO SIGN + '\xb6' # 0xB6 -> PILCROW SIGN + '\xb7' # 0xB7 -> MIDDLE DOT + '\xb8' # 0xB8 -> CEDILLA + '\xb9' # 0xB9 -> SUPERSCRIPT ONE + '\xba' # 0xBA -> MASCULINE ORDINAL INDICATOR + '\xbb' # 0xBB -> RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + '\xbc' # 0xBC -> VULGAR FRACTION ONE QUARTER + '\xbd' # 0xBD -> VULGAR FRACTION ONE HALF + '\xbe' # 0xBE -> VULGAR FRACTION THREE QUARTERS + '\xbf' # 0xBF -> INVERTED QUESTION MARK + '\xc0' # 0xC0 -> LATIN CAPITAL LETTER A WITH GRAVE + '\xc1' # 0xC1 -> LATIN CAPITAL LETTER A WITH ACUTE + '\xc2' # 0xC2 -> LATIN CAPITAL LETTER A WITH CIRCUMFLEX + '\xc3' # 0xC3 -> LATIN CAPITAL LETTER A WITH TILDE + '\xc4' # 0xC4 -> LATIN CAPITAL LETTER A WITH DIAERESIS + '\xc5' # 0xC5 -> LATIN CAPITAL LETTER A WITH RING ABOVE + '\xc6' # 0xC6 -> LATIN CAPITAL LETTER AE + '\xc7' # 0xC7 -> LATIN CAPITAL LETTER C WITH CEDILLA + '\xc8' # 0xC8 -> LATIN CAPITAL LETTER E WITH GRAVE + '\xc9' # 0xC9 -> LATIN CAPITAL LETTER E WITH ACUTE + '\xca' # 0xCA -> LATIN CAPITAL LETTER E WITH CIRCUMFLEX + '\xcb' # 0xCB -> LATIN CAPITAL LETTER E WITH DIAERESIS + '\xcc' # 0xCC -> LATIN CAPITAL LETTER I WITH GRAVE + '\xcd' # 0xCD -> LATIN CAPITAL LETTER I WITH ACUTE + '\xce' # 0xCE -> LATIN CAPITAL LETTER I WITH CIRCUMFLEX + '\xcf' # 0xCF -> LATIN CAPITAL LETTER I WITH DIAERESIS + '\xd0' # 0xD0 -> LATIN CAPITAL LETTER ETH (Icelandic) + '\xd1' # 0xD1 -> LATIN CAPITAL LETTER N WITH TILDE + '\xd2' # 0xD2 -> LATIN CAPITAL LETTER O WITH GRAVE + '\xd3' # 0xD3 -> LATIN CAPITAL LETTER O WITH ACUTE + '\xd4' # 0xD4 -> LATIN CAPITAL LETTER O WITH CIRCUMFLEX + '\xd5' # 0xD5 -> LATIN CAPITAL LETTER O WITH TILDE + '\xd6' # 0xD6 -> LATIN CAPITAL LETTER O WITH DIAERESIS + '\xd7' # 0xD7 -> MULTIPLICATION SIGN + '\xd8' # 0xD8 -> LATIN CAPITAL LETTER O WITH STROKE + '\xd9' # 0xD9 -> LATIN CAPITAL LETTER U WITH GRAVE + '\xda' # 0xDA -> LATIN CAPITAL LETTER U WITH ACUTE + '\xdb' # 0xDB -> LATIN CAPITAL LETTER U WITH CIRCUMFLEX + '\xdc' # 0xDC -> LATIN CAPITAL LETTER U WITH DIAERESIS + '\xdd' # 0xDD -> LATIN CAPITAL LETTER Y WITH ACUTE + '\xde' # 0xDE -> LATIN CAPITAL LETTER THORN (Icelandic) + '\xdf' # 0xDF -> LATIN SMALL LETTER SHARP S (German) + '\xe0' # 0xE0 -> LATIN SMALL LETTER A WITH GRAVE + '\xe1' # 0xE1 -> LATIN SMALL LETTER A WITH ACUTE + '\xe2' # 0xE2 -> LATIN SMALL LETTER A WITH CIRCUMFLEX + '\xe3' # 0xE3 -> LATIN SMALL LETTER A WITH TILDE + '\xe4' # 0xE4 -> LATIN SMALL LETTER A WITH DIAERESIS + '\xe5' # 0xE5 -> LATIN SMALL LETTER A WITH RING ABOVE + '\xe6' # 0xE6 -> LATIN SMALL LETTER AE + '\xe7' # 0xE7 -> LATIN SMALL LETTER C WITH CEDILLA + '\xe8' # 0xE8 -> LATIN SMALL LETTER E WITH GRAVE + '\xe9' # 0xE9 -> LATIN SMALL LETTER E WITH ACUTE + '\xea' # 0xEA -> LATIN SMALL LETTER E WITH CIRCUMFLEX + '\xeb' # 0xEB -> LATIN SMALL LETTER E WITH DIAERESIS + '\xec' # 0xEC -> LATIN SMALL LETTER I WITH GRAVE + '\xed' # 0xED -> LATIN SMALL LETTER I WITH ACUTE + '\xee' # 0xEE -> LATIN SMALL LETTER I WITH CIRCUMFLEX + '\xef' # 0xEF -> LATIN SMALL LETTER I WITH DIAERESIS + '\xf0' # 0xF0 -> LATIN SMALL LETTER ETH (Icelandic) + '\xf1' # 0xF1 -> LATIN SMALL LETTER N WITH TILDE + '\xf2' # 0xF2 -> LATIN SMALL LETTER O WITH GRAVE + '\xf3' # 0xF3 -> LATIN SMALL LETTER O WITH ACUTE + '\xf4' # 0xF4 -> LATIN SMALL LETTER O WITH CIRCUMFLEX + '\xf5' # 0xF5 -> LATIN SMALL LETTER O WITH TILDE + '\xf6' # 0xF6 -> LATIN SMALL LETTER O WITH DIAERESIS + '\xf7' # 0xF7 -> DIVISION SIGN + '\xf8' # 0xF8 -> LATIN SMALL LETTER O WITH STROKE + '\xf9' # 0xF9 -> LATIN SMALL LETTER U WITH GRAVE + '\xfa' # 0xFA -> LATIN SMALL LETTER U WITH ACUTE + '\xfb' # 0xFB -> LATIN SMALL LETTER U WITH CIRCUMFLEX + '\xfc' # 0xFC -> LATIN SMALL LETTER U WITH DIAERESIS + '\xfd' # 0xFD -> LATIN SMALL LETTER Y WITH ACUTE + '\xfe' # 0xFE -> LATIN SMALL LETTER THORN (Icelandic) + '\xff' # 0xFF -> LATIN SMALL LETTER Y WITH DIAERESIS +) + +### Encoding table +encoding_table=codecs.charmap_build(decoding_table) + diff --git a/Lib/encodings/ptcp154.py b/Lib/encodings/ptcp154.py index aef8975..656b79d 100644 --- a/Lib/encodings/ptcp154.py +++ b/Lib/encodings/ptcp154.py @@ -14,18 +14,18 @@ import codecs class Codec(codecs.Codec): def encode(self,input,errors='strict'): - return codecs.charmap_encode(input,errors,encoding_map) + return codecs.charmap_encode(input,errors,encoding_table) def decode(self,input,errors='strict'): - return codecs.charmap_decode(input,errors,decoding_map) + return codecs.charmap_decode(input,errors,decoding_table) class IncrementalEncoder(codecs.IncrementalEncoder): def encode(self, input, final=False): - return codecs.charmap_encode(input,self.errors,encoding_map)[0] + return codecs.charmap_encode(input,self.errors,encoding_table)[0] class IncrementalDecoder(codecs.IncrementalDecoder): def decode(self, input, final=False): - return codecs.charmap_decode(input,self.errors,decoding_map)[0] + return codecs.charmap_decode(input,self.errors,decoding_table)[0] class StreamWriter(Codec,codecs.StreamWriter): pass @@ -46,130 +46,267 @@ def getregentry(): streamwriter=StreamWriter, ) -### Decoding Map - -decoding_map = codecs.make_identity_dict(range(256)) -decoding_map.update({ - 0x0080: 0x0496, # CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER - 0x0081: 0x0492, # CYRILLIC CAPITAL LETTER GHE WITH STROKE - 0x0082: 0x04ee, # CYRILLIC CAPITAL LETTER U WITH MACRON - 0x0083: 0x0493, # CYRILLIC SMALL LETTER GHE WITH STROKE - 0x0084: 0x201e, # DOUBLE LOW-9 QUOTATION MARK - 0x0085: 0x2026, # HORIZONTAL ELLIPSIS - 0x0086: 0x04b6, # CYRILLIC CAPITAL LETTER CHE WITH DESCENDER - 0x0087: 0x04ae, # CYRILLIC CAPITAL LETTER STRAIGHT U - 0x0088: 0x04b2, # CYRILLIC CAPITAL LETTER HA WITH DESCENDER - 0x0089: 0x04af, # CYRILLIC SMALL LETTER STRAIGHT U - 0x008a: 0x04a0, # CYRILLIC CAPITAL LETTER BASHKIR KA - 0x008b: 0x04e2, # CYRILLIC CAPITAL LETTER I WITH MACRON - 0x008c: 0x04a2, # CYRILLIC CAPITAL LETTER EN WITH DESCENDER - 0x008d: 0x049a, # CYRILLIC CAPITAL LETTER KA WITH DESCENDER - 0x008e: 0x04ba, # CYRILLIC CAPITAL LETTER SHHA - 0x008f: 0x04b8, # CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE - 0x0090: 0x0497, # CYRILLIC SMALL LETTER ZHE WITH DESCENDER - 0x0091: 0x2018, # LEFT SINGLE QUOTATION MARK - 0x0092: 0x2019, # RIGHT SINGLE QUOTATION MARK - 0x0093: 0x201c, # LEFT DOUBLE QUOTATION MARK - 0x0094: 0x201d, # RIGHT DOUBLE QUOTATION MARK - 0x0095: 0x2022, # BULLET - 0x0096: 0x2013, # EN DASH - 0x0097: 0x2014, # EM DASH - 0x0098: 0x04b3, # CYRILLIC SMALL LETTER HA WITH DESCENDER - 0x0099: 0x04b7, # CYRILLIC SMALL LETTER CHE WITH DESCENDER - 0x009a: 0x04a1, # CYRILLIC SMALL LETTER BASHKIR KA - 0x009b: 0x04e3, # CYRILLIC SMALL LETTER I WITH MACRON - 0x009c: 0x04a3, # CYRILLIC SMALL LETTER EN WITH DESCENDER - 0x009d: 0x049b, # CYRILLIC SMALL LETTER KA WITH DESCENDER - 0x009e: 0x04bb, # CYRILLIC SMALL LETTER SHHA - 0x009f: 0x04b9, # CYRILLIC SMALL LETTER CHE WITH VERTICAL STROKE - 0x00a1: 0x040e, # CYRILLIC CAPITAL LETTER SHORT U (Byelorussian) - 0x00a2: 0x045e, # CYRILLIC SMALL LETTER SHORT U (Byelorussian) - 0x00a3: 0x0408, # CYRILLIC CAPITAL LETTER JE - 0x00a4: 0x04e8, # CYRILLIC CAPITAL LETTER BARRED O - 0x00a5: 0x0498, # CYRILLIC CAPITAL LETTER ZE WITH DESCENDER - 0x00a6: 0x04b0, # CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE - 0x00a8: 0x0401, # CYRILLIC CAPITAL LETTER IO - 0x00aa: 0x04d8, # CYRILLIC CAPITAL LETTER SCHWA - 0x00ad: 0x04ef, # CYRILLIC SMALL LETTER U WITH MACRON - 0x00af: 0x049c, # CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE - 0x00b1: 0x04b1, # CYRILLIC SMALL LETTER STRAIGHT U WITH STROKE - 0x00b2: 0x0406, # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I - 0x00b3: 0x0456, # CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I - 0x00b4: 0x0499, # CYRILLIC SMALL LETTER ZE WITH DESCENDER - 0x00b5: 0x04e9, # CYRILLIC SMALL LETTER BARRED O - 0x00b8: 0x0451, # CYRILLIC SMALL LETTER IO - 0x00b9: 0x2116, # NUMERO SIGN - 0x00ba: 0x04d9, # CYRILLIC SMALL LETTER SCHWA - 0x00bc: 0x0458, # CYRILLIC SMALL LETTER JE - 0x00bd: 0x04aa, # CYRILLIC CAPITAL LETTER ES WITH DESCENDER - 0x00be: 0x04ab, # CYRILLIC SMALL LETTER ES WITH DESCENDER - 0x00bf: 0x049d, # CYRILLIC SMALL LETTER KA WITH VERTICAL STROKE - 0x00c0: 0x0410, # CYRILLIC CAPITAL LETTER A - 0x00c1: 0x0411, # CYRILLIC CAPITAL LETTER BE - 0x00c2: 0x0412, # CYRILLIC CAPITAL LETTER VE - 0x00c3: 0x0413, # CYRILLIC CAPITAL LETTER GHE - 0x00c4: 0x0414, # CYRILLIC CAPITAL LETTER DE - 0x00c5: 0x0415, # CYRILLIC CAPITAL LETTER IE - 0x00c6: 0x0416, # CYRILLIC CAPITAL LETTER ZHE - 0x00c7: 0x0417, # CYRILLIC CAPITAL LETTER ZE - 0x00c8: 0x0418, # CYRILLIC CAPITAL LETTER I - 0x00c9: 0x0419, # CYRILLIC CAPITAL LETTER SHORT I - 0x00ca: 0x041a, # CYRILLIC CAPITAL LETTER KA - 0x00cb: 0x041b, # CYRILLIC CAPITAL LETTER EL - 0x00cc: 0x041c, # CYRILLIC CAPITAL LETTER EM - 0x00cd: 0x041d, # CYRILLIC CAPITAL LETTER EN - 0x00ce: 0x041e, # CYRILLIC CAPITAL LETTER O - 0x00cf: 0x041f, # CYRILLIC CAPITAL LETTER PE - 0x00d0: 0x0420, # CYRILLIC CAPITAL LETTER ER - 0x00d1: 0x0421, # CYRILLIC CAPITAL LETTER ES - 0x00d2: 0x0422, # CYRILLIC CAPITAL LETTER TE - 0x00d3: 0x0423, # CYRILLIC CAPITAL LETTER U - 0x00d4: 0x0424, # CYRILLIC CAPITAL LETTER EF - 0x00d5: 0x0425, # CYRILLIC CAPITAL LETTER HA - 0x00d6: 0x0426, # CYRILLIC CAPITAL LETTER TSE - 0x00d7: 0x0427, # CYRILLIC CAPITAL LETTER CHE - 0x00d8: 0x0428, # CYRILLIC CAPITAL LETTER SHA - 0x00d9: 0x0429, # CYRILLIC CAPITAL LETTER SHCHA - 0x00da: 0x042a, # CYRILLIC CAPITAL LETTER HARD SIGN - 0x00db: 0x042b, # CYRILLIC CAPITAL LETTER YERU - 0x00dc: 0x042c, # CYRILLIC CAPITAL LETTER SOFT SIGN - 0x00dd: 0x042d, # CYRILLIC CAPITAL LETTER E - 0x00de: 0x042e, # CYRILLIC CAPITAL LETTER YU - 0x00df: 0x042f, # CYRILLIC CAPITAL LETTER YA - 0x00e0: 0x0430, # CYRILLIC SMALL LETTER A - 0x00e1: 0x0431, # CYRILLIC SMALL LETTER BE - 0x00e2: 0x0432, # CYRILLIC SMALL LETTER VE - 0x00e3: 0x0433, # CYRILLIC SMALL LETTER GHE - 0x00e4: 0x0434, # CYRILLIC SMALL LETTER DE - 0x00e5: 0x0435, # CYRILLIC SMALL LETTER IE - 0x00e6: 0x0436, # CYRILLIC SMALL LETTER ZHE - 0x00e7: 0x0437, # CYRILLIC SMALL LETTER ZE - 0x00e8: 0x0438, # CYRILLIC SMALL LETTER I - 0x00e9: 0x0439, # CYRILLIC SMALL LETTER SHORT I - 0x00ea: 0x043a, # CYRILLIC SMALL LETTER KA - 0x00eb: 0x043b, # CYRILLIC SMALL LETTER EL - 0x00ec: 0x043c, # CYRILLIC SMALL LETTER EM - 0x00ed: 0x043d, # CYRILLIC SMALL LETTER EN - 0x00ee: 0x043e, # CYRILLIC SMALL LETTER O - 0x00ef: 0x043f, # CYRILLIC SMALL LETTER PE - 0x00f0: 0x0440, # CYRILLIC SMALL LETTER ER - 0x00f1: 0x0441, # CYRILLIC SMALL LETTER ES - 0x00f2: 0x0442, # CYRILLIC SMALL LETTER TE - 0x00f3: 0x0443, # CYRILLIC SMALL LETTER U - 0x00f4: 0x0444, # CYRILLIC SMALL LETTER EF - 0x00f5: 0x0445, # CYRILLIC SMALL LETTER HA - 0x00f6: 0x0446, # CYRILLIC SMALL LETTER TSE - 0x00f7: 0x0447, # CYRILLIC SMALL LETTER CHE - 0x00f8: 0x0448, # CYRILLIC SMALL LETTER SHA - 0x00f9: 0x0449, # CYRILLIC SMALL LETTER SHCHA - 0x00fa: 0x044a, # CYRILLIC SMALL LETTER HARD SIGN - 0x00fb: 0x044b, # CYRILLIC SMALL LETTER YERU - 0x00fc: 0x044c, # CYRILLIC SMALL LETTER SOFT SIGN - 0x00fd: 0x044d, # CYRILLIC SMALL LETTER E - 0x00fe: 0x044e, # CYRILLIC SMALL LETTER YU - 0x00ff: 0x044f, # CYRILLIC SMALL LETTER YA -}) - -### Encoding Map - -encoding_map = codecs.make_encoding_map(decoding_map) + +### Decoding Table + +decoding_table = ( + '\x00' # 0x00 -> NULL + '\x01' # 0x01 -> START OF HEADING + '\x02' # 0x02 -> START OF TEXT + '\x03' # 0x03 -> END OF TEXT + '\x04' # 0x04 -> END OF TRANSMISSION + '\x05' # 0x05 -> ENQUIRY + '\x06' # 0x06 -> ACKNOWLEDGE + '\x07' # 0x07 -> BELL + '\x08' # 0x08 -> BACKSPACE + '\t' # 0x09 -> HORIZONTAL TABULATION + '\n' # 0x0A -> LINE FEED + '\x0b' # 0x0B -> VERTICAL TABULATION + '\x0c' # 0x0C -> FORM FEED + '\r' # 0x0D -> CARRIAGE RETURN + '\x0e' # 0x0E -> SHIFT OUT + '\x0f' # 0x0F -> SHIFT IN + '\x10' # 0x10 -> DATA LINK ESCAPE + '\x11' # 0x11 -> DEVICE CONTROL ONE + '\x12' # 0x12 -> DEVICE CONTROL TWO + '\x13' # 0x13 -> DEVICE CONTROL THREE + '\x14' # 0x14 -> DEVICE CONTROL FOUR + '\x15' # 0x15 -> NEGATIVE ACKNOWLEDGE + '\x16' # 0x16 -> SYNCHRONOUS IDLE + '\x17' # 0x17 -> END OF TRANSMISSION BLOCK + '\x18' # 0x18 -> CANCEL + '\x19' # 0x19 -> END OF MEDIUM + '\x1a' # 0x1A -> SUBSTITUTE + '\x1b' # 0x1B -> ESCAPE + '\x1c' # 0x1C -> FILE SEPARATOR + '\x1d' # 0x1D -> GROUP SEPARATOR + '\x1e' # 0x1E -> RECORD SEPARATOR + '\x1f' # 0x1F -> UNIT SEPARATOR + ' ' # 0x20 -> SPACE + '!' # 0x21 -> EXCLAMATION MARK + '"' # 0x22 -> QUOTATION MARK + '#' # 0x23 -> NUMBER SIGN + '$' # 0x24 -> DOLLAR SIGN + '%' # 0x25 -> PERCENT SIGN + '&' # 0x26 -> AMPERSAND + "'" # 0x27 -> APOSTROPHE + '(' # 0x28 -> LEFT PARENTHESIS + ')' # 0x29 -> RIGHT PARENTHESIS + '*' # 0x2A -> ASTERISK + '+' # 0x2B -> PLUS SIGN + ',' # 0x2C -> COMMA + '-' # 0x2D -> HYPHEN-MINUS + '.' # 0x2E -> FULL STOP + '/' # 0x2F -> SOLIDUS + '0' # 0x30 -> DIGIT ZERO + '1' # 0x31 -> DIGIT ONE + '2' # 0x32 -> DIGIT TWO + '3' # 0x33 -> DIGIT THREE + '4' # 0x34 -> DIGIT FOUR + '5' # 0x35 -> DIGIT FIVE + '6' # 0x36 -> DIGIT SIX + '7' # 0x37 -> DIGIT SEVEN + '8' # 0x38 -> DIGIT EIGHT + '9' # 0x39 -> DIGIT NINE + ':' # 0x3A -> COLON + ';' # 0x3B -> SEMICOLON + '<' # 0x3C -> LESS-THAN SIGN + '=' # 0x3D -> EQUALS SIGN + '>' # 0x3E -> GREATER-THAN SIGN + '?' # 0x3F -> QUESTION MARK + '@' # 0x40 -> COMMERCIAL AT + 'A' # 0x41 -> LATIN CAPITAL LETTER A + 'B' # 0x42 -> LATIN CAPITAL LETTER B + 'C' # 0x43 -> LATIN CAPITAL LETTER C + 'D' # 0x44 -> LATIN CAPITAL LETTER D + 'E' # 0x45 -> LATIN CAPITAL LETTER E + 'F' # 0x46 -> LATIN CAPITAL LETTER F + 'G' # 0x47 -> LATIN CAPITAL LETTER G + 'H' # 0x48 -> LATIN CAPITAL LETTER H + 'I' # 0x49 -> LATIN CAPITAL LETTER I + 'J' # 0x4A -> LATIN CAPITAL LETTER J + 'K' # 0x4B -> LATIN CAPITAL LETTER K + 'L' # 0x4C -> LATIN CAPITAL LETTER L + 'M' # 0x4D -> LATIN CAPITAL LETTER M + 'N' # 0x4E -> LATIN CAPITAL LETTER N + 'O' # 0x4F -> LATIN CAPITAL LETTER O + 'P' # 0x50 -> LATIN CAPITAL LETTER P + 'Q' # 0x51 -> LATIN CAPITAL LETTER Q + 'R' # 0x52 -> LATIN CAPITAL LETTER R + 'S' # 0x53 -> LATIN CAPITAL LETTER S + 'T' # 0x54 -> LATIN CAPITAL LETTER T + 'U' # 0x55 -> LATIN CAPITAL LETTER U + 'V' # 0x56 -> LATIN CAPITAL LETTER V + 'W' # 0x57 -> LATIN CAPITAL LETTER W + 'X' # 0x58 -> LATIN CAPITAL LETTER X + 'Y' # 0x59 -> LATIN CAPITAL LETTER Y + 'Z' # 0x5A -> LATIN CAPITAL LETTER Z + '[' # 0x5B -> LEFT SQUARE BRACKET + '\\' # 0x5C -> REVERSE SOLIDUS + ']' # 0x5D -> RIGHT SQUARE BRACKET + '^' # 0x5E -> CIRCUMFLEX ACCENT + '_' # 0x5F -> LOW LINE + '`' # 0x60 -> GRAVE ACCENT + 'a' # 0x61 -> LATIN SMALL LETTER A + 'b' # 0x62 -> LATIN SMALL LETTER B + 'c' # 0x63 -> LATIN SMALL LETTER C + 'd' # 0x64 -> LATIN SMALL LETTER D + 'e' # 0x65 -> LATIN SMALL LETTER E + 'f' # 0x66 -> LATIN SMALL LETTER F + 'g' # 0x67 -> LATIN SMALL LETTER G + 'h' # 0x68 -> LATIN SMALL LETTER H + 'i' # 0x69 -> LATIN SMALL LETTER I + 'j' # 0x6A -> LATIN SMALL LETTER J + 'k' # 0x6B -> LATIN SMALL LETTER K + 'l' # 0x6C -> LATIN SMALL LETTER L + 'm' # 0x6D -> LATIN SMALL LETTER M + 'n' # 0x6E -> LATIN SMALL LETTER N + 'o' # 0x6F -> LATIN SMALL LETTER O + 'p' # 0x70 -> LATIN SMALL LETTER P + 'q' # 0x71 -> LATIN SMALL LETTER Q + 'r' # 0x72 -> LATIN SMALL LETTER R + 's' # 0x73 -> LATIN SMALL LETTER S + 't' # 0x74 -> LATIN SMALL LETTER T + 'u' # 0x75 -> LATIN SMALL LETTER U + 'v' # 0x76 -> LATIN SMALL LETTER V + 'w' # 0x77 -> LATIN SMALL LETTER W + 'x' # 0x78 -> LATIN SMALL LETTER X + 'y' # 0x79 -> LATIN SMALL LETTER Y + 'z' # 0x7A -> LATIN SMALL LETTER Z + '{' # 0x7B -> LEFT CURLY BRACKET + '|' # 0x7C -> VERTICAL LINE + '}' # 0x7D -> RIGHT CURLY BRACKET + '~' # 0x7E -> TILDE + '\x7f' # 0x7F -> DELETE (DEL) + '\u0496' # 0x80 -> CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER + '\u0492' # 0x81 -> CYRILLIC CAPITAL LETTER GHE WITH STROKE + '\u04ee' # 0x82 -> CYRILLIC CAPITAL LETTER U WITH MACRON + '\u0493' # 0x83 -> CYRILLIC SMALL LETTER GHE WITH STROKE + '\u201e' # 0x84 -> DOUBLE LOW-9 QUOTATION MARK + '\u2026' # 0x85 -> HORIZONTAL ELLIPSIS + '\u04b6' # 0x86 -> CYRILLIC CAPITAL LETTER CHE WITH DESCENDER + '\u04ae' # 0x87 -> CYRILLIC CAPITAL LETTER STRAIGHT U + '\u04b2' # 0x88 -> CYRILLIC CAPITAL LETTER HA WITH DESCENDER + '\u04af' # 0x89 -> CYRILLIC SMALL LETTER STRAIGHT U + '\u04a0' # 0x8A -> CYRILLIC CAPITAL LETTER BASHKIR KA + '\u04e2' # 0x8B -> CYRILLIC CAPITAL LETTER I WITH MACRON + '\u04a2' # 0x8C -> CYRILLIC CAPITAL LETTER EN WITH DESCENDER + '\u049a' # 0x8D -> CYRILLIC CAPITAL LETTER KA WITH DESCENDER + '\u04ba' # 0x8E -> CYRILLIC CAPITAL LETTER SHHA + '\u04b8' # 0x8F -> CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE + '\u0497' # 0x90 -> CYRILLIC SMALL LETTER ZHE WITH DESCENDER + '\u2018' # 0x91 -> LEFT SINGLE QUOTATION MARK + '\u2019' # 0x92 -> RIGHT SINGLE QUOTATION MARK + '\u201c' # 0x93 -> LEFT DOUBLE QUOTATION MARK + '\u201d' # 0x94 -> RIGHT DOUBLE QUOTATION MARK + '\u2022' # 0x95 -> BULLET + '\u2013' # 0x96 -> EN DASH + '\u2014' # 0x97 -> EM DASH + '\u04b3' # 0x98 -> CYRILLIC SMALL LETTER HA WITH DESCENDER + '\u04b7' # 0x99 -> CYRILLIC SMALL LETTER CHE WITH DESCENDER + '\u04a1' # 0x9A -> CYRILLIC SMALL LETTER BASHKIR KA + '\u04e3' # 0x9B -> CYRILLIC SMALL LETTER I WITH MACRON + '\u04a3' # 0x9C -> CYRILLIC SMALL LETTER EN WITH DESCENDER + '\u049b' # 0x9D -> CYRILLIC SMALL LETTER KA WITH DESCENDER + '\u04bb' # 0x9E -> CYRILLIC SMALL LETTER SHHA + '\u04b9' # 0x9F -> CYRILLIC SMALL LETTER CHE WITH VERTICAL STROKE + '\xa0' # 0xA0 -> NO-BREAK SPACE + '\u040e' # 0xA1 -> CYRILLIC CAPITAL LETTER SHORT U (Byelorussian) + '\u045e' # 0xA2 -> CYRILLIC SMALL LETTER SHORT U (Byelorussian) + '\u0408' # 0xA3 -> CYRILLIC CAPITAL LETTER JE + '\u04e8' # 0xA4 -> CYRILLIC CAPITAL LETTER BARRED O + '\u0498' # 0xA5 -> CYRILLIC CAPITAL LETTER ZE WITH DESCENDER + '\u04b0' # 0xA6 -> CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE + '\xa7' # 0xA7 -> SECTION SIGN + '\u0401' # 0xA8 -> CYRILLIC CAPITAL LETTER IO + '\xa9' # 0xA9 -> COPYRIGHT SIGN + '\u04d8' # 0xAA -> CYRILLIC CAPITAL LETTER SCHWA + '\xab' # 0xAB -> LEFT-POINTING DOUBLE ANGLE QUOTATION MARK + '\xac' # 0xAC -> NOT SIGN + '\u04ef' # 0xAD -> CYRILLIC SMALL LETTER U WITH MACRON + '\xae' # 0xAE -> REGISTERED SIGN + '\u049c' # 0xAF -> CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE + '\xb0' # 0xB0 -> DEGREE SIGN + '\u04b1' # 0xB1 -> CYRILLIC SMALL LETTER STRAIGHT U WITH STROKE + '\u0406' # 0xB2 -> CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I + '\u0456' # 0xB3 -> CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I + '\u0499' # 0xB4 -> CYRILLIC SMALL LETTER ZE WITH DESCENDER + '\u04e9' # 0xB5 -> CYRILLIC SMALL LETTER BARRED O + '\xb6' # 0xB6 -> PILCROW SIGN + '\xb7' # 0xB7 -> MIDDLE DOT + '\u0451' # 0xB8 -> CYRILLIC SMALL LETTER IO + '\u2116' # 0xB9 -> NUMERO SIGN + '\u04d9' # 0xBA -> CYRILLIC SMALL LETTER SCHWA + '\xbb' # 0xBB -> RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + '\u0458' # 0xBC -> CYRILLIC SMALL LETTER JE + '\u04aa' # 0xBD -> CYRILLIC CAPITAL LETTER ES WITH DESCENDER + '\u04ab' # 0xBE -> CYRILLIC SMALL LETTER ES WITH DESCENDER + '\u049d' # 0xBF -> CYRILLIC SMALL LETTER KA WITH VERTICAL STROKE + '\u0410' # 0xC0 -> CYRILLIC CAPITAL LETTER A + '\u0411' # 0xC1 -> CYRILLIC CAPITAL LETTER BE + '\u0412' # 0xC2 -> CYRILLIC CAPITAL LETTER VE + '\u0413' # 0xC3 -> CYRILLIC CAPITAL LETTER GHE + '\u0414' # 0xC4 -> CYRILLIC CAPITAL LETTER DE + '\u0415' # 0xC5 -> CYRILLIC CAPITAL LETTER IE + '\u0416' # 0xC6 -> CYRILLIC CAPITAL LETTER ZHE + '\u0417' # 0xC7 -> CYRILLIC CAPITAL LETTER ZE + '\u0418' # 0xC8 -> CYRILLIC CAPITAL LETTER I + '\u0419' # 0xC9 -> CYRILLIC CAPITAL LETTER SHORT I + '\u041a' # 0xCA -> CYRILLIC CAPITAL LETTER KA + '\u041b' # 0xCB -> CYRILLIC CAPITAL LETTER EL + '\u041c' # 0xCC -> CYRILLIC CAPITAL LETTER EM + '\u041d' # 0xCD -> CYRILLIC CAPITAL LETTER EN + '\u041e' # 0xCE -> CYRILLIC CAPITAL LETTER O + '\u041f' # 0xCF -> CYRILLIC CAPITAL LETTER PE + '\u0420' # 0xD0 -> CYRILLIC CAPITAL LETTER ER + '\u0421' # 0xD1 -> CYRILLIC CAPITAL LETTER ES + '\u0422' # 0xD2 -> CYRILLIC CAPITAL LETTER TE + '\u0423' # 0xD3 -> CYRILLIC CAPITAL LETTER U + '\u0424' # 0xD4 -> CYRILLIC CAPITAL LETTER EF + '\u0425' # 0xD5 -> CYRILLIC CAPITAL LETTER HA + '\u0426' # 0xD6 -> CYRILLIC CAPITAL LETTER TSE + '\u0427' # 0xD7 -> CYRILLIC CAPITAL LETTER CHE + '\u0428' # 0xD8 -> CYRILLIC CAPITAL LETTER SHA + '\u0429' # 0xD9 -> CYRILLIC CAPITAL LETTER SHCHA + '\u042a' # 0xDA -> CYRILLIC CAPITAL LETTER HARD SIGN + '\u042b' # 0xDB -> CYRILLIC CAPITAL LETTER YERU + '\u042c' # 0xDC -> CYRILLIC CAPITAL LETTER SOFT SIGN + '\u042d' # 0xDD -> CYRILLIC CAPITAL LETTER E + '\u042e' # 0xDE -> CYRILLIC CAPITAL LETTER YU + '\u042f' # 0xDF -> CYRILLIC CAPITAL LETTER YA + '\u0430' # 0xE0 -> CYRILLIC SMALL LETTER A + '\u0431' # 0xE1 -> CYRILLIC SMALL LETTER BE + '\u0432' # 0xE2 -> CYRILLIC SMALL LETTER VE + '\u0433' # 0xE3 -> CYRILLIC SMALL LETTER GHE + '\u0434' # 0xE4 -> CYRILLIC SMALL LETTER DE + '\u0435' # 0xE5 -> CYRILLIC SMALL LETTER IE + '\u0436' # 0xE6 -> CYRILLIC SMALL LETTER ZHE + '\u0437' # 0xE7 -> CYRILLIC SMALL LETTER ZE + '\u0438' # 0xE8 -> CYRILLIC SMALL LETTER I + '\u0439' # 0xE9 -> CYRILLIC SMALL LETTER SHORT I + '\u043a' # 0xEA -> CYRILLIC SMALL LETTER KA + '\u043b' # 0xEB -> CYRILLIC SMALL LETTER EL + '\u043c' # 0xEC -> CYRILLIC SMALL LETTER EM + '\u043d' # 0xED -> CYRILLIC SMALL LETTER EN + '\u043e' # 0xEE -> CYRILLIC SMALL LETTER O + '\u043f' # 0xEF -> CYRILLIC SMALL LETTER PE + '\u0440' # 0xF0 -> CYRILLIC SMALL LETTER ER + '\u0441' # 0xF1 -> CYRILLIC SMALL LETTER ES + '\u0442' # 0xF2 -> CYRILLIC SMALL LETTER TE + '\u0443' # 0xF3 -> CYRILLIC SMALL LETTER U + '\u0444' # 0xF4 -> CYRILLIC SMALL LETTER EF + '\u0445' # 0xF5 -> CYRILLIC SMALL LETTER HA + '\u0446' # 0xF6 -> CYRILLIC SMALL LETTER TSE + '\u0447' # 0xF7 -> CYRILLIC SMALL LETTER CHE + '\u0448' # 0xF8 -> CYRILLIC SMALL LETTER SHA + '\u0449' # 0xF9 -> CYRILLIC SMALL LETTER SHCHA + '\u044a' # 0xFA -> CYRILLIC SMALL LETTER HARD SIGN + '\u044b' # 0xFB -> CYRILLIC SMALL LETTER YERU + '\u044c' # 0xFC -> CYRILLIC SMALL LETTER SOFT SIGN + '\u044d' # 0xFD -> CYRILLIC SMALL LETTER E + '\u044e' # 0xFE -> CYRILLIC SMALL LETTER YU + '\u044f' # 0xFF -> CYRILLIC SMALL LETTER YA +) + +### Encoding table +encoding_table=codecs.charmap_build(decoding_table) diff --git a/Misc/NEWS b/Misc/NEWS index 7e20d1b..46f87bc 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,9 @@ What's New in Python 3.3.0 Beta 1? Core and Builtins ----------------- +- Issue #14874: Restore charmap decoding speed to pre-PEP 393 levels. + Patch by Serhiy Storchaka. + - Issue #15026: utf-16 encoding is now significantly faster (up to 10x). Patch by Serhiy Storchaka. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index ce82717..353d2bb 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -7435,24 +7435,53 @@ PyUnicode_DecodeCharmap(const char *s, e = s + size; if (PyUnicode_CheckExact(mapping)) { Py_ssize_t maplen; - enum PyUnicode_Kind kind; - void *data; + enum PyUnicode_Kind mapkind; + void *mapdata; Py_UCS4 x; if (PyUnicode_READY(mapping) == -1) return NULL; maplen = PyUnicode_GET_LENGTH(mapping); - data = PyUnicode_DATA(mapping); - kind = PyUnicode_KIND(mapping); + mapdata = PyUnicode_DATA(mapping); + mapkind = PyUnicode_KIND(mapping); while (s < e) { - unsigned char ch = *s; + unsigned char ch; + if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { + enum PyUnicode_Kind outkind = PyUnicode_KIND(v); + if (outkind == PyUnicode_1BYTE_KIND) { + void *outdata = PyUnicode_DATA(v); + Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v); + while (s < e) { + unsigned char ch = *s; + x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch); + if (x > maxchar) + goto Error; + PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x); + ++s; + } + break; + } + else if (outkind == PyUnicode_2BYTE_KIND) { + void *outdata = PyUnicode_DATA(v); + while (s < e) { + unsigned char ch = *s; + x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch); + if (x == 0xFFFE) + goto Error; + PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x); + ++s; + } + break; + } + } + ch = *s; if (ch < maplen) - x = PyUnicode_READ(kind, data, ch); + x = PyUnicode_READ(mapkind, mapdata, ch); else x = 0xfffe; /* invalid value */ - +Error: if (x == 0xfffe) { /* undefined mapping */ @@ -7667,14 +7696,17 @@ PyUnicode_BuildEncodingMap(PyObject* string) int count2 = 0, count3 = 0; int kind; void *data; + Py_ssize_t length; Py_UCS4 ch; - if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) { + if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) { PyErr_BadArgument(); return NULL; } kind = PyUnicode_KIND(string); data = PyUnicode_DATA(string); + length = PyUnicode_GET_LENGTH(string); + length = Py_MIN(length, 256); memset(level1, 0xFF, sizeof level1); memset(level2, 0xFF, sizeof level2); @@ -7683,7 +7715,7 @@ PyUnicode_BuildEncodingMap(PyObject* string) a mapping dictionary. */ if (PyUnicode_READ(kind, data, 0) != 0) need_dict = 1; - for (i = 1; i < 256; i++) { + for (i = 1; i < length; i++) { int l1, l2; ch = PyUnicode_READ(kind, data, i); if (ch == 0 || ch > 0xFFFF) { @@ -7709,7 +7741,7 @@ PyUnicode_BuildEncodingMap(PyObject* string) PyObject *key, *value; if (!result) return NULL; - for (i = 0; i < 256; i++) { + for (i = 0; i < length; i++) { key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); value = PyLong_FromLong(i); if (!key || !value) @@ -7743,17 +7775,18 @@ PyUnicode_BuildEncodingMap(PyObject* string) memset(mlevel2, 0xFF, 16*count2); memset(mlevel3, 0, 128*count3); count3 = 0; - for (i = 1; i < 256; i++) { + for (i = 1; i < length; i++) { int o1, o2, o3, i2, i3; - if (PyUnicode_READ(kind, data, i) == 0xFFFE) + Py_UCS4 ch = PyUnicode_READ(kind, data, i); + if (ch == 0xFFFE) /* unmapped character */ continue; - o1 = PyUnicode_READ(kind, data, i)>>11; - o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF; + o1 = ch>>11; + o2 = (ch>>7) & 0xF; i2 = 16*mlevel1[o1] + o2; if (mlevel2[i2] == 0xFF) mlevel2[i2] = count3++; - o3 = PyUnicode_READ(kind, data, i) & 0x7F; + o3 = ch & 0x7F; i3 = 128*mlevel2[i2] + o3; mlevel3[i3] = i; } diff --git a/Tools/unicode/gencodec.py b/Tools/unicode/gencodec.py index 7e7d6d0..f5a1af3 100644 --- a/Tools/unicode/gencodec.py +++ b/Tools/unicode/gencodec.py @@ -102,7 +102,7 @@ def readmap(filename): comment = '' else: comment = comment[1:].strip() - if enc < 256: + if not isinstance(enc, tuple) and enc < 256: if enc in unmapped: unmapped.remove(enc) if enc == uni: @@ -202,11 +202,10 @@ def python_tabledef_code(varname, map, comments=1, key_precision=2): # Analyze map and create table dict mappings = sorted(map.items()) table = {} - maxkey = 0 + maxkey = 255 if 'IDENTITY' in map: for key in range(256): table[key] = (key, '') - maxkey = 255 del map['IDENTITY'] for mapkey, mapvalue in mappings: mapcomment = '' @@ -224,6 +223,7 @@ def python_tabledef_code(varname, map, comments=1, key_precision=2): return None # Create table code + maxchar = 0 for key in range(maxkey + 1): if key not in table: mapvalue = MISSING_CODE @@ -238,6 +238,7 @@ def python_tabledef_code(varname, map, comments=1, key_precision=2): return None else: mapchar = chr(mapvalue) + maxchar = max(maxchar, ord(mapchar)) if mapcomment and comments: append(' %a \t# %s -> %s' % (mapchar, hexrepr(key, key_precision), @@ -245,6 +246,8 @@ def python_tabledef_code(varname, map, comments=1, key_precision=2): else: append(' %a' % mapchar) + if maxchar < 256: + append(' %a \t## Widen to UCS2 for optimization' % UNI_UNDEFINED) append(')') return l -- cgit v0.12