diff options
Diffstat (limited to 'Lib/codecs.py')
-rw-r--r-- | Lib/codecs.py | 414 |
1 files changed, 414 insertions, 0 deletions
""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import struct
import types
try:
    import __builtin__
except ImportError:
    # Interpreters that ship the builtins module under the name
    # "builtins": bind it to the name used below so open() works
    # unchanged.  (Spelled without "import ... as" on purpose.)
    import builtins
    __builtin__ = builtins

### Registry and builtin stateless codec functions

from _codecs import *

### Constants

#
# Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
#
# BOM is U+FEFF packed as an unsigned 16-bit value in the native byte
# order of the running platform.
BOM = struct.pack('=H',0xFEFF)
#
BOM_BE = BOM32_BE = '\376\377'
#       bytes FE FF: U+FEFF (ZERO WIDTH NO-BREAK SPACE) as written
#       by a big endian UTF-16 encoder
BOM_LE = BOM32_LE = '\377\376'
#       bytes FF FE: U+FEFF as written by a little endian UTF-16
#       encoder; read with the wrong (big endian) byte order they
#       give U+FFFE, which is defined as being an illegal Unicode
#       character

#
# Four-byte Byte Order Marks for UCS-4.  The names say "64", but each
# mark is four bytes (32 bits) wide; the names are left unchanged so
# that existing users keep working.
#
BOM64_BE = '\000\000\376\377'
#       U+0000FEFF in big endian UCS-4
BOM64_LE = '\377\376\000\000'
#       U+0000FEFF in little endian UCS-4


### Codec base classes (defining the API)

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may implement different error
        handling schemes by providing the errors argument. These
        string values are defined:

         'strict' - raise an error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs.

        Both methods raise NotImplementedError here; subclasses are
        expected to override them.

    """
    def encode(self,input,errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order
            to make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self,input,errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order
            to make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

#
# The StreamWriter and StreamReader class provide generic working
# interfaces which can be used to implement new encodings submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self,stream,errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def write(self,object):

        """ Writes the object's contents encoded to self.stream.

            The object is passed through self.encode() together with
            the error handling given to the constructor; only the
            encoded data is used, the consumed length is discarded.

        """
        data = self.encode(object,self.errors)[0]
        self.stream.write(data)

    # XXX .writelines() ?

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation keeps no state and does nothing.

        """
        pass

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Only consulted when normal attribute lookup fails.  The
            getattr default merely binds the builtin at definition
            time; callers are not expected to pass it.

        """
        return getattr(self.stream,name)

###

class StreamReader(Codec):

    def __init__(self,stream,errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

        """
        self.stream = stream
        self.errors = errors

    def read(self,size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            Decoding uses the error handling given to the constructor
            (self.errors).  If decoding a slice raises ValueError, up
            to 10 additional single bytes are read and the decode is
            retried; the error is re-raised when the stream runs dry
            or the retries are used up.

        """
        errors = self.errors

        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        data = read(size)
        tries = 0
        while 1:
            try:
                decoded = decode(data, errors)[0]
            except ValueError:
                # The slice may have ended inside a multi-byte
                # sequence.  This method is slow but should work under
                # pretty much all conditions; at most 10 tries are made
                tries = tries + 1
                newdata = read(1)
                if not newdata or tries > 10:
                    raise
                data = data + newdata
            else:
                return decoded

    # XXX .readline() and .readlines() (these are hard to implement
    #     without using buffers for keeping read-ahead data)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            The base implementation keeps no state and does nothing.

        """
        pass

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Only consulted when normal attribute lookup fails.  The
            getattr default merely binds the builtin at definition
            time; callers are not expected to pass it.

        """
        return getattr(self.stream,name)

###

class StreamReaderWriter:

    """ Wraps one stream with a reader and a writer so that it can be
        used for decoded reading and encoded writing alike.

    """

    def __init__(self,stream,Reader,Writer,errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self,size=-1):

        """ Returns whatever self.reader.read(size) returns.
        """
        return self.reader.read(size)

    def write(self,data):

        """ Hands data to self.writer.write() and returns its result.
        """
        return self.writer.write(data)

    def reset(self):

        """ Resets the reader first, then the writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Only consulted when normal attribute lookup fails.  The
            getattr default merely binds the builtin at definition
            time; callers are not expected to pass it.

        """
        return getattr(self.stream,name)

###

class StreamRecoder:

    """ Two-way recoding wrapper around a stream; see __init__ for
        the roles of the four codec callables.

    """

    def __init__(self,stream,encode,decode,Reader,Writer,errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self,size=-1):

        """ Reads via the backend reader, then applies the frontend
            encode to the result and returns the encoded data.
        """
        intermediate = self.reader.read(size)
        return self.encode(intermediate, self.errors)[0]

    def write(self,data):

        """ Applies the frontend decode to data, then writes the
            result via the backend writer and returns its result.
        """
        intermediate = self.decode(data, self.errors)[0]
        return self.writer.write(intermediate)

    # .writelines(), .readline() and .readlines() ... see notes
    # above.

    def reset(self):

        """ Resets the reader first, then the writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Only consulted when normal attribute lookup fails.  The
            getattr default merely binds the builtin at definition
            time; callers are not expected to pass it.

        """
        return getattr(self.stream,name)

### Shortcuts

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        mode is passed to the builtin open(). It defaults to 'rb'.
        When an encoding is given, 'b' is appended to the mode if it
        is missing, so the file is always opened in binary mode.

        encoding specifies the encoding which is to be used for
        the file. If it is None (the default), the plain file object
        is returned without any wrapping.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        If the codec lookup or the construction of the wrapper fails,
        the file is closed again before the error is propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        (e,d,sr,sw) = lookup(encoding)
        return StreamReaderWriter(file, sr, sw, errors)
    except:
        # Do not leak the open file; the original error is re-raised
        # unchanged.
        file.close()
        raise

def EncodedFile(file, input, output=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given input encoding and then written to the original
        file as string using the output encoding. The intermediate
        encoding will usually be Unicode but depends on the specified
        codecs.

        If output is not given, it defaults to input.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

    """
    if output is None:
        output = input
    # Frontend: stateless encoder/decoder of the input encoding.
    # Backend: stream reader/writer of the output encoding.
    encode, decode = lookup(input)[:2]
    Reader, Writer = lookup(output)[2:]
    return StreamRecoder(file,
                         encode,decode,Reader,Writer,
                         errors)

### Tests

if __name__ == '__main__':

    import sys

    # Make stdout translate Latin-1 into Unicode-Escape
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'unicode-escape')