summary | refs | log | tree | commit | diff | stats
path: root/Lib/encodings
diff options
context:
space:
mode:
author: Walter Dörwald <walter@livinglogic.de> 2004-09-07 20:24:22 (GMT)
committer: Walter Dörwald <walter@livinglogic.de> 2004-09-07 20:24:22 (GMT)
commit: 69652035bc2cf22b0326bb00824f4b7e2674cc8b (patch)
tree: 088104a47f9c9cfc466a3e1c5f4d2560b2d41450 /Lib/encodings
parent: a708d6e3b0aa2d225d4e5ab338862f67994e1c45 (diff)
download: cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.zip
cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.gz
cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.bz2
SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support
decoding incomplete input (when the input stream is temporarily exhausted).
codecs.StreamReader now implements buffering, which enables proper readline
support for the UTF-16 decoders. codecs.StreamReader.read() has a new
argument, chars, which specifies the number of characters to return.
codecs.StreamReader.readline() and codecs.StreamReader.readlines() have a new
argument, keepends; trailing "\n"s will be stripped from the lines if keepends
is false. Added the C APIs PyUnicode_DecodeUTF8Stateful and
PyUnicode_DecodeUTF16Stateful.
Diffstat (limited to 'Lib/encodings')
-rw-r--r-- Lib/encodings/utf_16.py 64
-rw-r--r-- Lib/encodings/utf_16_be.py 20
-rw-r--r-- Lib/encodings/utf_16_le.py 19
-rw-r--r-- Lib/encodings/utf_8.py 18
4 files changed, 49 insertions, 72 deletions
diff --git a/Lib/encodings/utf_16.py b/Lib/encodings/utf_16.py
index 8c79c79..a33581c 100644
--- a/Lib/encodings/utf_16.py
+++ b/Lib/encodings/utf_16.py
@@ -10,54 +10,40 @@ import codecs, sys
### Codec APIs
-class Codec(codecs.Codec):
+encode = codecs.utf_16_encode
- # Note: Binding these as C functions will result in the class not
- # converting them to methods. This is intended.
- encode = codecs.utf_16_encode
- decode = codecs.utf_16_decode
+def decode(input, errors='strict'):
+ return codecs.utf_16_decode(input, errors, True)
-class StreamWriter(Codec,codecs.StreamWriter):
+class StreamWriter(codecs.StreamWriter):
def __init__(self, stream, errors='strict'):
- self.bom_written = 0
+ self.bom_written = False
codecs.StreamWriter.__init__(self, stream, errors)
- def write(self, data):
- result = codecs.StreamWriter.write(self, data)
- if not self.bom_written:
- self.bom_written = 1
- if sys.byteorder == 'little':
- self.encode = codecs.utf_16_le_encode
- else:
- self.encode = codecs.utf_16_be_encode
+ def encode(self, input, errors='strict'):
+ self.bom_written = True
+ result = codecs.utf_16_encode(input, errors)
+ if sys.byteorder == 'little':
+ self.encode = codecs.utf_16_le_encode
+ else:
+ self.encode = codecs.utf_16_be_encode
return result
-class StreamReader(Codec,codecs.StreamReader):
- def __init__(self, stream, errors='strict'):
- self.bom_read = 0
- codecs.StreamReader.__init__(self, stream, errors)
-
- def read(self, size=-1):
- if not self.bom_read:
- signature = self.stream.read(2)
- if signature == codecs.BOM_BE:
- self.decode = codecs.utf_16_be_decode
- elif signature == codecs.BOM_LE:
- self.decode = codecs.utf_16_le_decode
- else:
- raise UnicodeError,"UTF-16 stream does not start with BOM"
- if size > 2:
- size -= 2
- elif size >= 0:
- size = 0
- self.bom_read = 1
- return codecs.StreamReader.read(self, size)
-
- def readline(self, size=None):
- raise NotImplementedError, '.readline() is not implemented for UTF-16'
+class StreamReader(codecs.StreamReader):
+
+ def decode(self, input, errors='strict'):
+ (object, consumed, byteorder) = \
+ codecs.utf_16_ex_decode(input, errors, 0, False)
+ if byteorder == -1:
+ self.decode = codecs.utf_16_le_decode
+ elif byteorder == 1:
+ self.decode = codecs.utf_16_be_decode
+ elif consumed>=2:
+ raise UnicodeError,"UTF-16 stream does not start with BOM"
+ return (object, consumed)
### encodings module API
def getregentry():
- return (Codec.encode,Codec.decode,StreamReader,StreamWriter)
+ return (encode,decode,StreamReader,StreamWriter)
diff --git a/Lib/encodings/utf_16_be.py b/Lib/encodings/utf_16_be.py
index dad540b..9a51f8c 100644
--- a/Lib/encodings/utf_16_be.py
+++ b/Lib/encodings/utf_16_be.py
@@ -10,23 +10,19 @@ import codecs
### Codec APIs
-class Codec(codecs.Codec):
+encode = codecs.utf_16_be_encode
- # Note: Binding these as C functions will result in the class not
- # converting them to methods. This is intended.
- encode = codecs.utf_16_be_encode
- decode = codecs.utf_16_be_decode
-
-class StreamWriter(Codec,codecs.StreamWriter):
- pass
+def decode(input, errors='strict'):
+ return codecs.utf_16_be_decode(input, errors, True)
-class StreamReader(Codec,codecs.StreamReader):
+class StreamWriter(codecs.StreamWriter):
+ encode = codecs.utf_16_be_encode
- def readline(self, size=None):
- raise NotImplementedError, '.readline() is not implemented for UTF-16-BE'
+class StreamReader(codecs.StreamReader):
+ decode = codecs.utf_16_be_decode
### encodings module API
def getregentry():
- return (Codec.encode,Codec.decode,StreamReader,StreamWriter)
+ return (encode,decode,StreamReader,StreamWriter)
diff --git a/Lib/encodings/utf_16_le.py b/Lib/encodings/utf_16_le.py
index 8120d5b..95ca830 100644
--- a/Lib/encodings/utf_16_le.py
+++ b/Lib/encodings/utf_16_le.py
@@ -10,23 +10,20 @@ import codecs
### Codec APIs
-class Codec(codecs.Codec):
+encode = codecs.utf_16_le_encode
- # Note: Binding these as C functions will result in the class not
- # converting them to methods. This is intended.
- encode = codecs.utf_16_le_encode
- decode = codecs.utf_16_le_decode
+def decode(input, errors='strict'):
+ return codecs.utf_16_le_decode(input, errors, True)
-class StreamWriter(Codec,codecs.StreamWriter):
- pass
+class StreamWriter(codecs.StreamWriter):
+ encode = codecs.utf_16_le_encode
-class StreamReader(Codec,codecs.StreamReader):
+class StreamReader(codecs.StreamReader):
+ decode = codecs.utf_16_le_decode
- def readline(self, size=None):
- raise NotImplementedError, '.readline() is not implemented for UTF-16-LE'
### encodings module API
def getregentry():
- return (Codec.encode,Codec.decode,StreamReader,StreamWriter)
+ return (encode,decode,StreamReader,StreamWriter)
diff --git a/Lib/encodings/utf_8.py b/Lib/encodings/utf_8.py
index 89249a9..9cb0b4b 100644
--- a/Lib/encodings/utf_8.py
+++ b/Lib/encodings/utf_8.py
@@ -10,21 +10,19 @@ import codecs
### Codec APIs
-class Codec(codecs.Codec):
+encode = codecs.utf_8_encode
- # Note: Binding these as C functions will result in the class not
- # converting them to methods. This is intended.
- encode = codecs.utf_8_encode
- decode = codecs.utf_8_decode
+def decode(input, errors='strict'):
+ return codecs.utf_8_decode(input, errors, True)
-class StreamWriter(Codec,codecs.StreamWriter):
- pass
+class StreamWriter(codecs.StreamWriter):
+ encode = codecs.utf_8_encode
-class StreamReader(Codec,codecs.StreamReader):
- pass
+class StreamReader(codecs.StreamReader):
+ decode = codecs.utf_8_decode
### encodings module API
def getregentry():
- return (Codec.encode,Codec.decode,StreamReader,StreamWriter)
+ return (encode,decode,StreamReader,StreamWriter)