SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support decoding incomplete input (when the input stream is temporarily exhausted). codecs.StreamReader now implements buffering, which enables proper readline support for the UTF-16 decoders. codecs.StreamReader.read() has a new argument chars which specifies the number of characters to return. codecs.StreamReader.readline() and codecs.StreamReader.readlines() have a new argument keepends. Trailing "\n"s will be stripped from the lines if keepends is false. Added C APIs PyUnicode_DecodeUTF8Stateful and PyUnicode_DecodeUTF16Stateful.
author: Walter Dörwald <walter@livinglogic.de> 2004-09-07 20:24:22 (GMT)
committer: Walter Dörwald <walter@livinglogic.de> 2004-09-07 20:24:22 (GMT)
commit: 69652035bc2cf22b0326bb00824f4b7e2674cc8b (patch)
tree: 088104a47f9c9cfc466a3e1c5f4d2560b2d41450 /Doc
parent: a708d6e3b0aa2d225d4e5ab338862f67994e1c45 (diff)
download: cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.zip
cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.gz
cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.bz2
2 files changed, 41 insertions, 11 deletions
diff --git a/Doc/api/concrete.tex b/Doc/api/concrete.tex
index a77a584..96b4faf 100644
--- a/Doc/api/concrete.tex
+++ b/Doc/api/concrete.tex
@@ -1076,6 +1076,17 @@ These are the UTF-8 codec APIs:
   by the codec.
 \end{cfuncdesc}
 
+\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeUTF8Stateful}{const char *s,
+                                               int size,
+                                               const char *errors,
+                                               int *consumed}
+  If \var{consumed} is \NULL{}, behaves like \cfunction{PyUnicode_DecodeUTF8()}.
+  If \var{consumed} is not \NULL{}, trailing incomplete UTF-8 byte sequences
+  will not be treated as an error. Those bytes will not be decoded and the
+  number of bytes that have been decoded will be stored in \var{consumed}.
+  \versionadded{2.4}
+\end{cfuncdesc}
+
 \begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeUTF8}{const Py_UNICODE *s,
                                                int size,
                                                const char *errors}
@@ -1121,6 +1132,20 @@ These are the UTF-16 codec APIs:
   Returns \NULL{} if an exception was raised by the codec.
 \end{cfuncdesc}
 
+\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeUTF16Stateful}{const char *s,
+                                               int size,
+                                               const char *errors,
+                                               int *byteorder,
+                                               int *consumed}
+  If \var{consumed} is \NULL{}, behaves like
+  \cfunction{PyUnicode_DecodeUTF16()}. If \var{consumed} is not \NULL{},
+  \cfunction{PyUnicode_DecodeUTF16Stateful()} will not treat trailing incomplete
+  UTF-16 byte sequences (i.e.\ an odd number of bytes or a split surrogate pair)
+  as an error. Those bytes will not be decoded and the number of bytes that
+  have been decoded will be stored in \var{consumed}.
+  \versionadded{2.4}
+\end{cfuncdesc}
+
 \begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeUTF16}{const Py_UNICODE *s,
                                                int size,
                                                const char *errors,
diff --git a/Doc/lib/libcodecs.tex b/Doc/lib/libcodecs.tex
index a6c434a..125463b 100644
--- a/Doc/lib/libcodecs.tex
+++ b/Doc/lib/libcodecs.tex
@@ -394,9 +394,14 @@ order to be compatible to the Python codec registry.
   be extended with \function{register_error()}.
 \end{classdesc}
 
-\begin{methoddesc}{read}{\optional{size}}
+\begin{methoddesc}{read}{\optional{size\optional{, chars}}}
   Decodes data from the stream and returns the resulting object.
 
+  \var{chars} indicates the number of characters to read from the
+  stream. \function{read()} will never return more than \var{chars}
+  characters, but it might return fewer, if there are not enough
+  characters available.
+
   \var{size} indicates the approximate maximum number of bytes to read
   from the stream for decoding purposes. The decoder can modify this
   setting as appropriate. The default value -1 indicates to read and
@@ -407,29 +412,29 @@ order to be compatible to the Python codec registry.
   read as much data as is allowed within the definition of the encoding
   and the given size, e.g.  if optional encoding endings or state
   markers are available on the stream, these should be read too.
+
+  \versionchanged[\var{chars} argument added]{2.4}
 \end{methoddesc}
 
-\begin{methoddesc}{readline}{[size]}
+\begin{methoddesc}{readline}{\optional{size\optional{, keepends}}}
   Read one line from the input stream and return the
   decoded data.
 
-  Unlike the \method{readlines()} method, this method inherits
-  the line breaking knowledge from the underlying stream's
-  \method{readline()} method -- there is currently no support for line
-  breaking using the codec decoder due to lack of line buffering.
-  Sublcasses should however, if possible, try to implement this method
-  using their own knowledge of line breaking.
-
   \var{size}, if given, is passed as size argument to the stream's
   \method{readline()} method.
+
+  If \var{keepends} is false, line endings will be stripped from the
+  lines returned.
+
+  \versionchanged[\var{keepends} argument added]{2.4}
 \end{methoddesc}
 
-\begin{methoddesc}{readlines}{[sizehint]}
+\begin{methoddesc}{readlines}{\optional{sizehint\optional{, keepends}}}
   Read all lines available on the input stream and return them as list
   of lines.
 
   Line breaks are implemented using the codec's decoder method and are
-  included in the list entries.
+  included in the list entries if \var{keepends} is true.
 
   \var{sizehint}, if given, is passed as \var{size} argument to the
   stream's \method{read()} method.
author	Walter Dörwald <walter@livinglogic.de>	2004-09-07 20:24:22 (GMT)
committer	Walter Dörwald <walter@livinglogic.de>	2004-09-07 20:24:22 (GMT)
commit	69652035bc2cf22b0326bb00824f4b7e2674cc8b (patch)
tree	088104a47f9c9cfc466a3e1c5f4d2560b2d41450 /Doc
parent	a708d6e3b0aa2d225d4e5ab338862f67994e1c45 (diff)
download	cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.zip cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.gz cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.bz2