diff options
author | Marc-André Lemburg <mal@egenix.com> | 2001-05-15 12:00:02 (GMT) |
---|---|---|
committer | Marc-André Lemburg <mal@egenix.com> | 2001-05-15 12:00:02 (GMT) |
commit | 2d9204199fe8913cca9890f1822413d981587ee5 (patch) | |
tree | f0734f9c8721508ebbd472cbc46abd9aa66c44dd /Lib | |
parent | 2e0a654f6edeb58bef3cccffa42c2a236117a88c (diff) | |
download | cpython-2d9204199fe8913cca9890f1822413d981587ee5.zip cpython-2d9204199fe8913cca9890f1822413d981587ee5.tar.gz cpython-2d9204199fe8913cca9890f1822413d981587ee5.tar.bz2 |
This patch changes the way the string .encode() method works slightly
and introduces a new method .decode().
The major change is that strg.encode() will no longer try to convert
Unicode returns from the codec into a string, but instead pass along
the Unicode object as-is. The same is now true for all other codec
return types. The underlying C APIs were changed accordingly.
Note that even though this does have the potential of breaking
existing code, the chances are low since conversion from Unicode
previously took place using the default encoding which is normally
set to ASCII rendering this auto-conversion mechanism useless for
most Unicode encodings.
The good news is that you can now use .encode() and .decode() with
much greater ease and that the door was opened for better accessibility
of the builtin codecs.
As demonstration of the new feature, the patch includes a few new
codecs which allow string to string encoding and decoding (rot13,
hex, zip, uu, base64).
Written by Marc-Andre Lemburg. Copyright assigned to the PSF.
Diffstat (limited to 'Lib')
-rwxr-xr-x | Lib/UserString.py | 8 | ||||
-rw-r--r-- | Lib/encodings/aliases.py | 9 | ||||
-rw-r--r-- | Lib/encodings/base64_codec.py | 60 | ||||
-rw-r--r-- | Lib/encodings/hex_codec.py | 60 | ||||
-rw-r--r-- | Lib/encodings/rot_13.py | 107 | ||||
-rw-r--r-- | Lib/encodings/uu_codec.py | 110 | ||||
-rw-r--r-- | Lib/encodings/zlib_codec.py | 61 | ||||
-rw-r--r-- | Lib/test/string_tests.py | 19 |
8 files changed, 434 insertions, 0 deletions
diff --git a/Lib/UserString.py b/Lib/UserString.py index 163faa5..45cdeb5 100755 --- a/Lib/UserString.py +++ b/Lib/UserString.py @@ -72,6 +72,14 @@ class UserString: def center(self, width): return self.__class__(self.data.center(width)) def count(self, sub, start=0, end=sys.maxint): return self.data.count(sub, start, end) + def decode(self, encoding=None, errors=None): # XXX improve this? + if encoding: + if errors: + return self.__class__(self.data.decode(encoding, errors)) + else: + return self.__class__(self.data.decode(encoding)) + else: + return self.__class__(self.data.decode()) def encode(self, encoding=None, errors=None): # XXX improve this? if encoding: if errors: diff --git a/Lib/encodings/aliases.py b/Lib/encodings/aliases.py index b9bfd97..5573f6d 100644 --- a/Lib/encodings/aliases.py +++ b/Lib/encodings/aliases.py @@ -79,4 +79,13 @@ aliases = { 'tis260': 'tactis', 'sjis': 'shift_jis', + # Content transfer/compression encodings + 'rot13': 'rot_13', + 'base64': 'base64_codec', + 'base_64': 'base64_codec', + 'zlib': 'zlib_codec', + 'zip': 'zlib_codec', + 'hex': 'hex_codec', + 'uu': 'uu_codec', + } diff --git a/Lib/encodings/base64_codec.py b/Lib/encodings/base64_codec.py new file mode 100644 index 0000000..ce21b1a --- /dev/null +++ b/Lib/encodings/base64_codec.py @@ -0,0 +1,60 @@ +""" Python 'base64_codec' Codec - base64 content transfer encoding + + Unlike most of the other codecs which target Unicode, this codec + will return Python string objects for both encode and decode. + + Written by Marc-Andre Lemburg (mal@lemburg.com). + +""" +import codecs, base64 + +### Codec APIs + +def base64_encode(input,errors='strict'): + + """ Encodes the object input and returns a tuple (output + object, length consumed). + + errors defines the error handling to apply. It defaults to + 'strict' handling which is the only currently supported + error handling for this codec. + + """ + assert errors == 'strict' + output = base64.encodestring(input) + return (output, len(input)) + +def base64_decode(input,errors='strict'): + + """ Decodes the object input and returns a tuple (output + object, length consumed). + + input must be an object which provides the bf_getreadbuf + buffer slot. Python strings, buffer objects and memory + mapped files are examples of objects providing this slot. + + errors defines the error handling to apply. It defaults to + 'strict' handling which is the only currently supported + error handling for this codec. + + """ + assert errors == 'strict' + output = base64.decodestring(input) + return (output, len(input)) + +class Codec(codecs.Codec): + + encode = base64_encode + decode = base64_decode + +class StreamWriter(Codec,codecs.StreamWriter): + pass + +class StreamReader(Codec,codecs.StreamReader): + pass + +### encodings module API + +def getregentry(): + + return (base64_encode,base64_decode,StreamReader,StreamWriter) diff --git a/Lib/encodings/hex_codec.py b/Lib/encodings/hex_codec.py new file mode 100644 index 0000000..ab7d86f --- /dev/null +++ b/Lib/encodings/hex_codec.py @@ -0,0 +1,60 @@ +""" Python 'hex_codec' Codec - 2-digit hex content transfer encoding + + Unlike most of the other codecs which target Unicode, this codec + will return Python string objects for both encode and decode. + + Written by Marc-Andre Lemburg (mal@lemburg.com). + +""" +import codecs, binascii + +### Codec APIs + +def hex_encode(input,errors='strict'): + + """ Encodes the object input and returns a tuple (output + object, length consumed). + + errors defines the error handling to apply. It defaults to + 'strict' handling which is the only currently supported + error handling for this codec. + + """ + assert errors == 'strict' + output = binascii.b2a_hex(input) + return (output, len(input)) + +def hex_decode(input,errors='strict'): + + """ Decodes the object input and returns a tuple (output + object, length consumed). + + input must be an object which provides the bf_getreadbuf + buffer slot. Python strings, buffer objects and memory + mapped files are examples of objects providing this slot. + + errors defines the error handling to apply. It defaults to + 'strict' handling which is the only currently supported + error handling for this codec. + + """ + assert errors == 'strict' + output = binascii.a2b_hex(input) + return (output, len(input)) + +class Codec(codecs.Codec): + + encode = hex_encode + decode = hex_decode + +class StreamWriter(Codec,codecs.StreamWriter): + pass + +class StreamReader(Codec,codecs.StreamReader): + pass + +### encodings module API + +def getregentry(): + + return (hex_encode,hex_decode,StreamReader,StreamWriter) diff --git a/Lib/encodings/rot_13.py b/Lib/encodings/rot_13.py new file mode 100644 index 0000000..8c54811 --- /dev/null +++ b/Lib/encodings/rot_13.py @@ -0,0 +1,107 @@ +#!/usr/local/bin/python2.1 +""" Python Character Mapping Codec for ROT13. + + See http://ucsub.colorado.edu/~kominek/rot13/ for details. + + Written by Marc-Andre Lemburg (mal@lemburg.com). + +"""#" + +import codecs + +### Codec APIs + +class Codec(codecs.Codec): + + def encode(self,input,errors='strict'): + + return codecs.charmap_encode(input,errors,encoding_map) + + def decode(self,input,errors='strict'): + + return codecs.charmap_decode(input,errors,decoding_map) + +class StreamWriter(Codec,codecs.StreamWriter): + pass + +class StreamReader(Codec,codecs.StreamReader): + pass + +### encodings module API + +def getregentry(): + + return (Codec().encode,Codec().decode,StreamReader,StreamWriter) + +### Decoding Map + +decoding_map = codecs.make_identity_dict(range(256)) +decoding_map.update({ + 0x0041: 0x004e, + 0x0042: 0x004f, + 0x0043: 0x0050, + 0x0044: 0x0051, + 0x0045: 0x0052, + 0x0046: 0x0053, + 0x0047: 0x0054, + 0x0048: 0x0055, + 0x0049: 0x0056, + 0x004a: 0x0057, + 0x004b: 0x0058, + 0x004c: 0x0059, + 0x004d: 0x005a, + 0x004e: 0x0041, + 0x004f: 0x0042, + 0x0050: 0x0043, + 0x0051: 0x0044, + 0x0052: 0x0045, + 0x0053: 0x0046, + 0x0054: 0x0047, + 0x0055: 0x0048, + 0x0056: 0x0049, + 0x0057: 0x004a, + 0x0058: 0x004b, + 0x0059: 0x004c, + 0x005a: 0x004d, + 0x0061: 0x006e, + 0x0062: 0x006f, + 0x0063: 0x0070, + 0x0064: 0x0071, + 0x0065: 0x0072, + 0x0066: 0x0073, + 0x0067: 0x0074, + 0x0068: 0x0075, + 0x0069: 0x0076, + 0x006a: 0x0077, + 0x006b: 0x0078, + 0x006c: 0x0079, + 0x006d: 0x007a, + 0x006e: 0x0061, + 0x006f: 0x0062, + 0x0070: 0x0063, + 0x0071: 0x0064, + 0x0072: 0x0065, + 0x0073: 0x0066, + 0x0074: 0x0067, + 0x0075: 0x0068, + 0x0076: 0x0069, + 0x0077: 0x006a, + 0x0078: 0x006b, + 0x0079: 0x006c, + 0x007a: 0x006d, +}) + +### Encoding Map + +encoding_map = {} +for k,v in decoding_map.items(): + encoding_map[v] = k + +### Filter API + +def rot13(infile, outfile): + outfile.write(infile.read().encode('rot-13')) + +if __name__ == '__main__': + import sys + rot13(sys.stdin, sys.stdout) diff --git a/Lib/encodings/uu_codec.py b/Lib/encodings/uu_codec.py new file mode 100644 index 0000000..82e799c --- /dev/null +++ b/Lib/encodings/uu_codec.py @@ -0,0 +1,110 @@ +""" Python 'uu_codec' Codec - UU content transfer encoding + + Unlike most of the other codecs which target Unicode, this codec + will return Python string objects for both encode and decode. + + Written by Marc-Andre Lemburg (mal@lemburg.com). Some details were + adapted from uu.py which was written by Lance Ellinghouse and + modified by Jack Jansen and Fredrik Lundh. + +""" +import codecs, binascii + +### Codec APIs + +def uu_encode(input,errors='strict',filename='<data>',mode=0666): + + """ Encodes the object input and returns a tuple (output + object, length consumed). + + errors defines the error handling to apply. It defaults to + 'strict' handling which is the only currently supported + error handling for this codec. + + """ + assert errors == 'strict' + from cStringIO import StringIO + from binascii import b2a_uu + infile = StringIO(input) + outfile = StringIO() + read = infile.read + write = outfile.write + + # Encode + write('begin %o %s\n' % (mode & 0777, filename)) + chunk = read(45) + while chunk: + write(b2a_uu(chunk)) + chunk = read(45) + write(' \nend\n') + + return (outfile.getvalue(), len(input)) + +def uu_decode(input,errors='strict'): + + """ Decodes the object input and returns a tuple (output + object, length consumed). + + input must be an object which provides the bf_getreadbuf + buffer slot. Python strings, buffer objects and memory + mapped files are examples of objects providing this slot. + + errors defines the error handling to apply. It defaults to + 'strict' handling which is the only currently supported + error handling for this codec. + + Note: filename and file mode information in the input data is + ignored. + + """ + assert errors == 'strict' + from cStringIO import StringIO + from binascii import a2b_uu + infile = StringIO(input) + outfile = StringIO() + readline = infile.readline + write = outfile.write + + # Find start of encoded data + while 1: + s = readline() + if not s: + raise ValueError, 'Missing "begin" line in input data' + if s[:5] == 'begin': + break + + # Decode + while 1: + s = readline() + if not s or \ + s == 'end\n': + break + try: + data = a2b_uu(s) + except binascii.Error, v: + # Workaround for broken uuencoders by /Fredrik Lundh + nbytes = (((ord(s[0])-32) & 63) * 4 + 5) / 3 + data = a2b_uu(s[:nbytes]) + #sys.stderr.write("Warning: %s\n" % str(v)) + write(data) + if not s: + raise ValueError, 'Truncated input data' + + return (outfile.getvalue(), len(input)) + +class Codec(codecs.Codec): + + encode = uu_encode + decode = uu_decode + +class StreamWriter(Codec,codecs.StreamWriter): + pass + +class StreamReader(Codec,codecs.StreamReader): + pass + +### encodings module API + +def getregentry(): + + return (uu_encode,uu_decode,StreamReader,StreamWriter) diff --git a/Lib/encodings/zlib_codec.py b/Lib/encodings/zlib_codec.py new file mode 100644 index 0000000..035bb04 --- /dev/null +++ b/Lib/encodings/zlib_codec.py @@ -0,0 +1,61 @@ +""" Python 'zlib_codec' Codec - zlib compression encoding + + Unlike most of the other codecs which target Unicode, this codec + will return Python string objects for both encode and decode. + + Written by Marc-Andre Lemburg (mal@lemburg.com). + +""" +import codecs +import zlib # this codec needs the optional zlib module ! + +### Codec APIs + +def zlib_encode(input,errors='strict'): + + """ Encodes the object input and returns a tuple (output + object, length consumed). + + errors defines the error handling to apply. It defaults to + 'strict' handling which is the only currently supported + error handling for this codec. + + """ + assert errors == 'strict' + output = zlib.compress(input) + return (output, len(input)) + +def zlib_decode(input,errors='strict'): + + """ Decodes the object input and returns a tuple (output + object, length consumed). + + input must be an object which provides the bf_getreadbuf + buffer slot. Python strings, buffer objects and memory + mapped files are examples of objects providing this slot. + + errors defines the error handling to apply. It defaults to + 'strict' handling which is the only currently supported + error handling for this codec. + + """ + assert errors == 'strict' + output = zlib.decompress(input) + return (output, len(input)) + +class Codec(codecs.Codec): + + encode = zlib_encode + decode = zlib_decode + +class StreamWriter(Codec,codecs.StreamWriter): + pass + +class StreamReader(Codec,codecs.StreamReader): + pass + +### encodings module API + +def getregentry(): + + return (zlib_encode,zlib_decode,StreamReader,StreamWriter) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index fcce50f..9b95a8e 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -1,6 +1,7 @@ """Common tests shared by test_string and test_userstring""" import string +from test_support import verify, verbose, TestFailed transtable = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`xyzdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377' @@ -212,3 +213,21 @@ def run_method_tests(test): test('endswith', 'helloworld', 0, 'lowo', 3, 8) test('endswith', 'ab', 0, 'ab', 0, 1) test('endswith', 'ab', 0, 'ab', 0, 0) + + # Encoding/decoding + codecs = [('rot13', 'uryyb jbeyq'), + ('base64', 'aGVsbG8gd29ybGQ=\n'), + ('hex', '68656c6c6f20776f726c64'), + ('uu', 'begin 666 <data>\n+:&5L;&\\@=V]R;&0 \n \nend\n')] + for encoding, data in codecs: + test('encode', 'hello world', data, encoding) + test('decode', data, 'hello world', encoding) + # zlib is optional, so we make the test optional too... + try: + import zlib + except ImportError: + pass + else: + data = 'x\x9c\xcbH\xcd\xc9\xc9W(\xcf/\xcaI\x01\x00\x1a\x0b\x04]' + verify('hello world'.encode('zlib') == data) + verify(data.decode('zlib') == 'hello world') |