gh-81322: support multiple separators in StreamReader.readuntil (#16429)

author: Bruce Merry <1963944+bmerry@users.noreply.github.com> 2024-04-08 16:58:02 (GMT)
committer: GitHub <noreply@github.com> 2024-04-08 16:58:02 (GMT)
commit: 775912a51d6847b0e4fe415fa91f2e0b06a3c43c (patch)
tree: 27a4a67ddefe2032621728d85c4b05b3d5e425c4 /Lib/asyncio
parent: 24a2bd048115efae799b0a9c5dd9fbb7a0806978 (diff)
download: cpython-775912a51d6847b0e4fe415fa91f2e0b06a3c43c.zip
cpython-775912a51d6847b0e4fe415fa91f2e0b06a3c43c.tar.gz
cpython-775912a51d6847b0e4fe415fa91f2e0b06a3c43c.tar.bz2
1 files changed, 44 insertions, 21 deletions
diff --git a/Lib/asyncio/streams.py b/Lib/asyncio/streams.py
index 3fe52db..4517ca2 100644
--- a/Lib/asyncio/streams.py
+++ b/Lib/asyncio/streams.py
@@ -590,20 +590,34 @@ class StreamReader:
         If the data cannot be read because of over limit, a
         LimitOverrunError exception  will be raised, and the data
         will be left in the internal buffer, so it can be read again.
+
+        The ``separator`` may also be an iterable of separators. In this
+        case the return value will be the shortest possible that has any
+        separator as the suffix. For the purposes of LimitOverrunError,
+        the shortest possible separator is considered to be the one that
+        matched.
         """
-        seplen = len(separator)
-        if seplen == 0:
+        if isinstance(separator, bytes):
+            separator = [separator]
+        else:
+            # Ensure the shortest match wins, and support arbitrary iterables
+            separator = sorted(separator, key=len)
+        if not separator:
+            raise ValueError('Separator should contain at least one element')
+        min_seplen = len(separator[0])
+        max_seplen = len(separator[-1])
+        if min_seplen == 0:
             raise ValueError('Separator should be at least one-byte string')
 
         if self._exception is not None:
             raise self._exception
 
         # Consume whole buffer except last bytes, which length is
-        # one less than seplen. Let's check corner cases with
-        # separator='SEPARATOR':
+        # one less than max_seplen. Let's check corner cases with
+        # separator[-1]='SEPARATOR':
         # * we have received almost complete separator (without last
         #   byte). i.e buffer='some textSEPARATO'. In this case we
-        #   can safely consume len(separator) - 1 bytes.
+        #   can safely consume max_seplen - 1 bytes.
         # * last byte of buffer is first byte of separator, i.e.
         #   buffer='abcdefghijklmnopqrS'. We may safely consume
         #   everything except that last byte, but this require to
@@ -616,26 +630,35 @@ class StreamReader:
         #   messages :)
 
         # `offset` is the number of bytes from the beginning of the buffer
-        # where there is no occurrence of `separator`.
+        # where there is no occurrence of any `separator`.
         offset = 0
 
-        # Loop until we find `separator` in the buffer, exceed the buffer size,
+        # Loop until we find a `separator` in the buffer, exceed the buffer size,
         # or an EOF has happened.
         while True:
             buflen = len(self._buffer)
 
-            # Check if we now have enough data in the buffer for `separator` to
-            # fit.
-            if buflen - offset >= seplen:
-                isep = self._buffer.find(separator, offset)
-
-                if isep != -1:
-                    # `separator` is in the buffer. `isep` will be used later
-                    # to retrieve the data.
+            # Check if we now have enough data in the buffer for the
+            # shortest separator to fit.
+            if buflen - offset >= min_seplen:
+                match_start = None
+                match_end = None
+                for sep in separator:
+                    isep = self._buffer.find(sep, offset)
+
+                    if isep != -1:
+                        # `sep` is in the buffer. `match_start` and
+                        # `match_end` will be used later to retrieve the
+                        # data.
+                        end = isep + len(sep)
+                        if match_end is None or end < match_end:
+                            match_end = end
+                            match_start = isep
+                if match_end is not None:
                     break
 
                 # see upper comment for explanation.
-                offset = buflen + 1 - seplen
+                offset = max(0, buflen + 1 - max_seplen)
                 if offset > self._limit:
                     raise exceptions.LimitOverrunError(
                         'Separator is not found, and chunk exceed the limit',
@@ -644,7 +667,7 @@ class StreamReader:
             # Complete message (with full separator) may be present in buffer
             # even when EOF flag is set. This may happen when the last chunk
             # adds data which makes separator be found. That's why we check for
-            # EOF *ater* inspecting the buffer.
+            # EOF *after* inspecting the buffer.
             if self._eof:
                 chunk = bytes(self._buffer)
                 self._buffer.clear()
@@ -653,12 +676,12 @@ class StreamReader:
             # _wait_for_data() will resume reading if stream was paused.
             await self._wait_for_data('readuntil')
 
-        if isep > self._limit:
+        if match_start > self._limit:
             raise exceptions.LimitOverrunError(
-                'Separator is found, but chunk is longer than limit', isep)
+                'Separator is found, but chunk is longer than limit', match_start)
 
-        chunk = self._buffer[:isep + seplen]
-        del self._buffer[:isep + seplen]
+        chunk = self._buffer[:match_end]
+        del self._buffer[:match_end]
         self._maybe_resume_transport()
         return bytes(chunk)
author	Bruce Merry <1963944+bmerry@users.noreply.github.com>	2024-04-08 16:58:02 (GMT)
committer	GitHub <noreply@github.com>	2024-04-08 16:58:02 (GMT)
commit	775912a51d6847b0e4fe415fa91f2e0b06a3c43c (patch)
tree	27a4a67ddefe2032621728d85c4b05b3d5e425c4 /Lib/asyncio
parent	24a2bd048115efae799b0a9c5dd9fbb7a0806978 (diff)
download	cpython-775912a51d6847b0e4fe415fa91f2e0b06a3c43c.zip cpython-775912a51d6847b0e4fe415fa91f2e0b06a3c43c.tar.gz cpython-775912a51d6847b0e4fe415fa91f2e0b06a3c43c.tar.bz2