diff options
author | Thomas Wouters <thomas@python.org> | 2006-04-21 09:43:23 (GMT) |
---|---|---|
committer | Thomas Wouters <thomas@python.org> | 2006-04-21 09:43:23 (GMT) |
commit | a977329b6fb0e4c95cabb9043794de69b27a1099 (patch) | |
tree | b91552a0578639bd10181ab612039c1bed9bec27 /Lib/codecs.py | |
parent | d858f70617a9df8456e89a898ad8f97bd57c09f9 (diff) | |
download | cpython-a977329b6fb0e4c95cabb9043794de69b27a1099.zip cpython-a977329b6fb0e4c95cabb9043794de69b27a1099.tar.gz cpython-a977329b6fb0e4c95cabb9043794de69b27a1099.tar.bz2 |
Merge part of the trunk changes into the p3yk branch. This merges from 43030
(branch-creation time) up to 43067. 43068 and 43069 contain a little
swapping action between re.py and sre.py, and this mightily confuses svn
merge, so later changes are going in separately.
This merge should break no additional tests.
The last-merged revision is going in a 'last_merge' property on '.' (the
branch directory.) Arbitrarily chosen, really; if there's a BCP for this, I
couldn't find it, but we can easily change it afterwards ;)
Diffstat (limited to 'Lib/codecs.py')
-rw-r--r-- | Lib/codecs.py | 183 |
1 files changed, 172 insertions, 11 deletions
diff --git a/Lib/codecs.py b/Lib/codecs.py index 6895a22..28856c7 100644 --- a/Lib/codecs.py +++ b/Lib/codecs.py @@ -73,6 +73,23 @@ BOM64_BE = BOM_UTF32_BE ### Codec base classes (defining the API) +class CodecInfo(tuple): + + def __new__(cls, encode, decode, streamreader=None, streamwriter=None, + incrementalencoder=None, incrementaldecoder=None, name=None): + self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter)) + self.name = name + self.encode = encode + self.decode = decode + self.incrementalencoder = incrementalencoder + self.incrementaldecoder = incrementaldecoder + self.streamwriter = streamwriter + self.streamreader = streamreader + return self + + def __repr__(self): + return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self)) + class Codec: """ Defines the interface for stateless encoders/decoders. @@ -137,6 +154,88 @@ class Codec: """ raise NotImplementedError +class IncrementalEncoder(object): + """ + A IncrementalEncoder encodes an input in multiple steps. The input can be + passed piece by piece to the encode() method. The IncrementalEncoder remembers + the state of the Encoding process between calls to encode(). + """ + def __init__(self, errors='strict'): + """ + Creates a IncrementalEncoder instance. + + The IncrementalEncoder may use different error handling schemes by + providing the errors keyword argument. See the module docstring + for a list of possible values. + """ + self.errors = errors + self.buffer = "" + + def encode(self, input, final=False): + """ + Encodes input and returns the resulting object. + """ + raise NotImplementedError + + def reset(self): + """ + Resets the encoder to the initial state. + """ + +class IncrementalDecoder(object): + """ + An IncrementalDecoder decodes an input in multiple steps. The input can be + passed piece by piece to the decode() method. The IncrementalDecoder + remembers the state of the decoding process between calls to decode(). + """ + def __init__(self, errors='strict'): + """ + Creates a IncrementalDecoder instance. + + The IncrementalDecoder may use different error handling schemes by + providing the errors keyword argument. See the module docstring + for a list of possible values. + """ + self.errors = errors + + def decode(self, input, final=False): + """ + Decodes input and returns the resulting object. + """ + raise NotImplementedError + + def reset(self): + """ + Resets the decoder to the initial state. + """ + +class BufferedIncrementalDecoder(IncrementalDecoder): + """ + This subclass of IncrementalDecoder can be used as the baseclass for an + incremental decoder if the decoder must be able to handle incomplete byte + sequences. + """ + def __init__(self, errors='strict'): + IncrementalDecoder.__init__(self, errors) + self.buffer = "" # undecoded input that is kept between calls to decode() + + def _buffer_decode(self, input, errors, final): + # Overwrite this method in subclasses: It must decode input + # and return an (output, length consumed) tuple + raise NotImplementedError + + def decode(self, input, final=False): + # decode input (taking the buffer into account) + data = self.buffer + input + (result, consumed) = self._buffer_decode(data, self.errors, final) + # keep undecoded input until the next call + self.buffer = data[consumed:] + return result + + def reset(self): + IncrementalDecoder.reset(self) + self.bytebuffer = "" + # # The StreamWriter and StreamReader class provide generic working # interfaces which can be used to implement new encoding submodules @@ -666,8 +765,8 @@ def open(filename, mode='rb', encoding=None, errors='strict', buffering=1): file = __builtin__.open(filename, mode, buffering) if encoding is None: return file - (e, d, sr, sw) = lookup(encoding) - srw = StreamReaderWriter(file, sr, sw, errors) + info = lookup(encoding) + srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors) # Add attributes to simplify introspection srw.encoding = encoding return srw @@ -699,11 +798,9 @@ def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'): """ if file_encoding is None: file_encoding = data_encoding - encode, decode = lookup(data_encoding)[:2] - Reader, Writer = lookup(file_encoding)[2:] - sr = StreamRecoder(file, - encode, decode, Reader, Writer, - errors) + info = lookup(data_encoding) + sr = StreamRecoder(file, info.encode, info.decode, + info.streamreader, info.streamwriter, errors) # Add attributes to simplify introspection sr.data_encoding = data_encoding sr.file_encoding = file_encoding @@ -719,7 +816,7 @@ def getencoder(encoding): Raises a LookupError in case the encoding cannot be found. """ - return lookup(encoding)[0] + return lookup(encoding).encode def getdecoder(encoding): @@ -729,7 +826,35 @@ def getdecoder(encoding): Raises a LookupError in case the encoding cannot be found. """ - return lookup(encoding)[1] + return lookup(encoding).decode + +def getincrementalencoder(encoding): + + """ Lookup up the codec for the given encoding and return + its IncrementalEncoder class or factory function. + + Raises a LookupError in case the encoding cannot be found + or the codecs doesn't provide an incremental encoder. + + """ + encoder = lookup(encoding).incrementalencoder + if encoder is None: + raise LookupError(encoding) + return encoder + +def getincrementaldecoder(encoding): + + """ Lookup up the codec for the given encoding and return + its IncrementalDecoder class or factory function. + + Raises a LookupError in case the encoding cannot be found + or the codecs doesn't provide an incremental decoder. + + """ + decoder = lookup(encoding).incrementaldecoder + if decoder is None: + raise LookupError(encoding) + return decoder def getreader(encoding): @@ -739,7 +864,7 @@ def getreader(encoding): Raises a LookupError in case the encoding cannot be found. """ - return lookup(encoding)[2] + return lookup(encoding).streamreader def getwriter(encoding): @@ -749,7 +874,43 @@ def getwriter(encoding): Raises a LookupError in case the encoding cannot be found. """ - return lookup(encoding)[3] + return lookup(encoding).streamwriter + +def iterencode(iterator, encoding, errors='strict', **kwargs): + """ + Encoding iterator. + + Encodes the input strings from the iterator using a IncrementalEncoder. + + errors and kwargs are passed through to the IncrementalEncoder + constructor. + """ + encoder = getincrementalencoder(encoding)(errors, **kwargs) + for input in iterator: + output = encoder.encode(input) + if output: + yield output + output = encoder.encode("", True) + if output: + yield output + +def iterdecode(iterator, encoding, errors='strict', **kwargs): + """ + Decoding iterator. + + Decodes the input strings from the iterator using a IncrementalDecoder. + + errors and kwargs are passed through to the IncrementalDecoder + constructor. + """ + decoder = getincrementaldecoder(encoding)(errors, **kwargs) + for input in iterator: + output = decoder.decode(input) + if output: + yield output + output = decoder.decode("", True) + if output: + yield output ### Helpers for charmap-based codecs |