From 3c0d0baf2badfad7deb346d1043f7d83bb92691f Mon Sep 17 00:00:00 2001 From: Martin Panter Date: Wed, 24 Aug 2016 06:33:33 +0000 Subject: Issue #12319: Support for chunked encoding of HTTP request bodies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the body object is a file, its size is no longer determined with fstat(), since that can report the wrong result (e.g. reading from a pipe). Instead, determine the size using seek(), or fall back to chunked encoding for unseekable files. Also, change the logic for detecting text files to check for TextIOBase inheritance, rather than inspecting the “mode” attribute, which may not exist (e.g. BytesIO and StringIO). The Content-Length for text files is no longer determined ahead of time, because the original logic could have been wrong depending on the codec and newline translation settings. Patch by Demian Brecht and Rolf Krahl, with a few tweaks by me. --- Doc/library/http.client.rst | 98 ++++++++++++++------ Doc/library/urllib.request.rst | 60 ++++++++----- Doc/whatsnew/3.6.rst | 19 ++++ Lib/http/client.py | 199 ++++++++++++++++++++++++++++++----------- Lib/test/test_httplib.py | 151 ++++++++++++++++++++++++++++++- Lib/test/test_urllib2.py | 103 ++++++++++++++++----- Lib/urllib/request.py | 42 ++++----- Misc/ACKS | 1 + Misc/NEWS | 8 ++ 9 files changed, 531 insertions(+), 150 deletions(-) diff --git a/Doc/library/http.client.rst b/Doc/library/http.client.rst index a9ca4b0..9429fb6 100644 --- a/Doc/library/http.client.rst +++ b/Doc/library/http.client.rst @@ -219,39 +219,62 @@ HTTPConnection Objects :class:`HTTPConnection` instances have the following methods: -.. method:: HTTPConnection.request(method, url, body=None, headers={}) +.. method:: HTTPConnection.request(method, url, body=None, headers={}, *, \ + encode_chunked=False) This will send a request to the server using the HTTP request method *method* and the selector *url*. If *body* is specified, the specified data is sent after the headers are - finished. It may be a string, a :term:`bytes-like object`, an open - :term:`file object`, or an iterable of :term:`bytes-like object`\s. If - *body* is a string, it is encoded as ISO-8859-1, the default for HTTP. If - it is a bytes-like object the bytes are sent as is. If it is a :term:`file - object`, the contents of the file is sent; this file object should support - at least the ``read()`` method. If the file object has a ``mode`` - attribute, the data returned by the ``read()`` method will be encoded as - ISO-8859-1 unless the ``mode`` attribute contains the substring ``b``, - otherwise the data returned by ``read()`` is sent as is. If *body* is an - iterable, the elements of the iterable are sent as is until the iterable is - exhausted. - - The *headers* argument should be a mapping of extra HTTP - headers to send with the request. - - If *headers* does not contain a Content-Length item, one is added - automatically if possible. If *body* is ``None``, the Content-Length header - is set to ``0`` for methods that expect a body (``PUT``, ``POST``, and - ``PATCH``). If *body* is a string or bytes object, the Content-Length - header is set to its length. If *body* is a :term:`file object` and it - works to call :func:`~os.fstat` on the result of its ``fileno()`` method, - then the Content-Length header is set to the ``st_size`` reported by the - ``fstat`` call. Otherwise no Content-Length header is added. + finished. It may be a :class:`str`, a :term:`bytes-like object`, an + open :term:`file object`, or an iterable of :class:`bytes`. If *body* + is a string, it is encoded as ISO-8859-1, the default for HTTP. If it + is a bytes-like object, the bytes are sent as is. If it is a :term:`file + object`, the contents of the file is sent; this file object should + support at least the ``read()`` method. If the file object is an + instance of :class:`io.TextIOBase`, the data returned by the ``read()`` + method will be encoded as ISO-8859-1, otherwise the data returned by + ``read()`` is sent as is. If *body* is an iterable, the elements of the + iterable are sent as is until the iterable is exhausted. + + The *headers* argument should be a mapping of extra HTTP headers to send + with the request. + + If *headers* contains neither Content-Length nor Transfer-Encoding, a + Content-Length header will be added automatically if possible. If + *body* is ``None``, the Content-Length header is set to ``0`` for + methods that expect a body (``PUT``, ``POST``, and ``PATCH``). If + *body* is a string or bytes-like object, the Content-Length header is + set to its length. If *body* is a binary :term:`file object` + supporting :meth:`~io.IOBase.seek`, this will be used to determine + its size. Otherwise, the Content-Length header is not added + automatically. In cases where determining the Content-Length up + front is not possible, the body will be chunk-encoded and the + Transfer-Encoding header will automatically be set. + + The *encode_chunked* argument is only relevant if Transfer-Encoding is + specified in *headers*. If *encode_chunked* is ``False``, the + HTTPConnection object assumes that all encoding is handled by the + calling code. If it is ``True``, the body will be chunk-encoded. + + .. note:: + Chunked transfer encoding has been added to the HTTP protocol + version 1.1. Unless the HTTP server is known to handle HTTP 1.1, + the caller must either specify the Content-Length or must use a + body representation whose length can be determined automatically. .. versionadded:: 3.2 *body* can now be an iterable. + .. versionchanged:: 3.6 + If neither Content-Length nor Transfer-Encoding are set in + *headers* and Content-Length cannot be determined, *body* will now + be automatically chunk-encoded. The *encode_chunked* argument + was added. + The Content-Length for binary file objects is determined with seek. + No attempt is made to determine the Content-Length for text file + objects. + .. method:: HTTPConnection.getresponse() Should be called after a request is sent to get the response from the server. @@ -336,13 +359,32 @@ also send your request step by step, by using the four functions below. an argument. -.. method:: HTTPConnection.endheaders(message_body=None) +.. method:: HTTPConnection.endheaders(message_body=None, *, encode_chunked=False) Send a blank line to the server, signalling the end of the headers. The optional *message_body* argument can be used to pass a message body - associated with the request. The message body will be sent in the same - packet as the message headers if it is string, otherwise it is sent in a - separate packet. + associated with the request. + + If *encode_chunked* is ``True``, the result of each iteration of + *message_body* will be chunk-encoded as specified in :rfc:`7230`, + Section 3.3.1. How the data is encoded is dependent on the type of + *message_body*. If *message_body* implements the :ref:`buffer interface + ` the encoding will result in a single chunk. + If *message_body* is a :class:`collections.Iterable`, each iteration + of *message_body* will result in a chunk. If *message_body* is a + :term:`file object`, each call to ``.read()`` will result in a chunk. + The method automatically signals the end of the chunk-encoded data + immediately after *message_body*. + + .. note:: Due to the chunked encoding specification, empty chunks + yielded by an iterator body will be ignored by the chunk-encoder. + This is to avoid premature termination of the read of the request by + the target server due to malformed encoding. + + .. versionadded:: 3.6 + Chunked encoding support. The *encode_chunked* parameter was + added. + .. method:: HTTPConnection.send(data) diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index 1291aeb..e619cc1 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -30,18 +30,9 @@ The :mod:`urllib.request` module defines the following functions: Open the URL *url*, which can be either a string or a :class:`Request` object. - *data* must be a bytes object specifying additional data to be sent to the - server, or ``None`` if no such data is needed. *data* may also be an - iterable object and in that case Content-Length value must be specified in - the headers. Currently HTTP requests are the only ones that use *data*; the - HTTP request will be a POST instead of a GET when the *data* parameter is - provided. - - *data* should be a buffer in the standard - :mimetype:`application/x-www-form-urlencoded` format. The - :func:`urllib.parse.urlencode` function takes a mapping or sequence of - 2-tuples and returns an ASCII text string in this format. It should - be encoded to bytes before being used as the *data* parameter. + *data* must be an object specifying additional data to be sent to the + server, or ``None`` if no such data is needed. See :class:`Request` + for details. urllib.request module uses HTTP/1.1 and includes ``Connection:close`` header in its HTTP requests. @@ -192,14 +183,22 @@ The following classes are provided: *url* should be a string containing a valid URL. - *data* must be a bytes object specifying additional data to send to the - server, or ``None`` if no such data is needed. Currently HTTP requests are - the only ones that use *data*; the HTTP request will be a POST instead of a - GET when the *data* parameter is provided. *data* should be a buffer in the - standard :mimetype:`application/x-www-form-urlencoded` format. - The :func:`urllib.parse.urlencode` function takes a mapping or sequence of - 2-tuples and returns an ASCII string in this format. It should be - encoded to bytes before being used as the *data* parameter. + *data* must be an object specifying additional data to send to the + server, or ``None`` if no such data is needed. Currently HTTP + requests are the only ones that use *data*. The supported object + types include bytes, file-like objects, and iterables. If no + ``Content-Length`` header has been provided, :class:`HTTPHandler` will + try to determine the length of *data* and set this header accordingly. + If this fails, ``Transfer-Encoding: chunked`` as specified in + :rfc:`7230`, Section 3.3.1 will be used to send the data. See + :meth:`http.client.HTTPConnection.request` for details on the + supported object types and on how the content length is determined. + + For an HTTP POST request method, *data* should be a buffer in the + standard :mimetype:`application/x-www-form-urlencoded` format. The + :func:`urllib.parse.urlencode` function takes a mapping or sequence + of 2-tuples and returns an ASCII string in this format. It should + be encoded to bytes before being used as the *data* parameter. *headers* should be a dictionary, and will be treated as if :meth:`add_header` was called with each key and value as arguments. @@ -211,8 +210,10 @@ The following classes are provided: :mod:`urllib`'s default user agent string is ``"Python-urllib/2.6"`` (on Python 2.6). - An example of using ``Content-Type`` header with *data* argument would be - sending a dictionary like ``{"Content-Type": "application/x-www-form-urlencoded"}``. + An appropriate ``Content-Type`` header should be included if the *data* + argument is present. If this header has not been provided and *data* + is not None, ``Content-Type: application/x-www-form-urlencoded`` will + be added as a default. The final two arguments are only of interest for correct handling of third-party HTTP cookies: @@ -235,15 +236,28 @@ The following classes are provided: *method* should be a string that indicates the HTTP request method that will be used (e.g. ``'HEAD'``). If provided, its value is stored in the :attr:`~Request.method` attribute and is used by :meth:`get_method()`. - Subclasses may indicate a default method by setting the + The default is ``'GET'`` if *data* is ``None`` or ``'POST'`` otherwise. + Subclasses may indicate a different default method by setting the :attr:`~Request.method` attribute in the class itself. + .. note:: + The request will not work as expected if the data object is unable + to deliver its content more than once (e.g. a file or an iterable + that can produce the content only once) and the request is retried + for HTTP redirects or authentication. The *data* is sent to the + HTTP server right away after the headers. There is no support for + a 100-continue expectation in the library. + .. versionchanged:: 3.3 :attr:`Request.method` argument is added to the Request class. .. versionchanged:: 3.4 Default :attr:`Request.method` may be indicated at the class level. + .. versionchanged:: 3.6 + Do not raise an error if the ``Content-Length`` has not been + provided and could not be determined. Fall back to use chunked + transfer encoding instead. .. class:: OpenerDirector() diff --git a/Doc/whatsnew/3.6.rst b/Doc/whatsnew/3.6.rst index 8b85b22..6d5bbc0 100644 --- a/Doc/whatsnew/3.6.rst +++ b/Doc/whatsnew/3.6.rst @@ -324,6 +324,15 @@ exceptions: see :func:`faulthandler.enable`. (Contributed by Victor Stinner in :issue:`23848`.) +http.client +----------- + +:meth:`HTTPConnection.request() ` and +:meth:`~http.client.HTTPConnection.endheaders` both now support +chunked encoding request bodies. +(Contributed by Demian Brecht and Rolf Krahl in :issue:`12319`.) + + idlelib and IDLE ---------------- @@ -500,6 +509,16 @@ The :class:`~unittest.mock.Mock` class has the following improvements: (Contributed by Amit Saha in :issue:`26323`.) +urllib.request +-------------- + +If a HTTP request has a non-empty body but no Content-Length header +and the content length cannot be determined up front, rather than +throwing an error, :class:`~urllib.request.AbstractHTTPHandler` now +falls back to use chunked transfer encoding. +(Contributed by Demian Brecht and Rolf Krahl in :issue:`12319`.) + + urllib.robotparser ------------------ diff --git a/Lib/http/client.py b/Lib/http/client.py index 763e1ef..b242ba6 100644 --- a/Lib/http/client.py +++ b/Lib/http/client.py @@ -795,6 +795,58 @@ class HTTPConnection: auto_open = 1 debuglevel = 0 + @staticmethod + def _is_textIO(stream): + """Test whether a file-like object is a text or a binary stream. + """ + return isinstance(stream, io.TextIOBase) + + @staticmethod + def _get_content_length(body, method): + """Get the content-length based on the body. + + If the body is "empty", we set Content-Length: 0 for methods + that expect a body (RFC 7230, Section 3.3.2). If the body is + set for other methods, we set the header provided we can + figure out what the length is. + """ + if not body: + # do an explicit check for not None here to distinguish + # between unset and set but empty + if method.upper() in _METHODS_EXPECTING_BODY or body is not None: + return 0 + else: + return None + + if hasattr(body, 'read'): + # file-like object. + if HTTPConnection._is_textIO(body): + # text streams are unpredictable because it depends on + # character encoding and line ending translation. + return None + else: + # Is it seekable? + try: + curpos = body.tell() + sz = body.seek(0, io.SEEK_END) + except (TypeError, AttributeError, OSError): + return None + else: + body.seek(curpos) + return sz - curpos + + try: + # does it implement the buffer protocol (bytes, bytearray, array)? + mv = memoryview(body) + return mv.nbytes + except TypeError: + pass + + if isinstance(body, str): + return len(body) + + return None + def __init__(self, host, port=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None): self.timeout = timeout @@ -933,18 +985,9 @@ class HTTPConnection: if hasattr(data, "read") : if self.debuglevel > 0: print("sendIng a read()able") - encode = False - try: - mode = data.mode - except AttributeError: - # io.BytesIO and other file-like objects don't have a `mode` - # attribute. - pass - else: - if "b" not in mode: - encode = True - if self.debuglevel > 0: - print("encoding file using iso-8859-1") + encode = self._is_textIO(data) + if encode and self.debuglevel > 0: + print("encoding file using iso-8859-1") while 1: datablock = data.read(blocksize) if not datablock: @@ -970,7 +1013,22 @@ class HTTPConnection: """ self._buffer.append(s) - def _send_output(self, message_body=None): + def _read_readable(self, readable): + blocksize = 8192 + if self.debuglevel > 0: + print("sendIng a read()able") + encode = self._is_textIO(readable) + if encode and self.debuglevel > 0: + print("encoding file using iso-8859-1") + while True: + datablock = readable.read(blocksize) + if not datablock: + break + if encode: + datablock = datablock.encode("iso-8859-1") + yield datablock + + def _send_output(self, message_body=None, encode_chunked=False): """Send the currently buffered request and clear the buffer. Appends an extra \\r\\n to the buffer. @@ -979,10 +1037,50 @@ class HTTPConnection: self._buffer.extend((b"", b"")) msg = b"\r\n".join(self._buffer) del self._buffer[:] - self.send(msg) + if message_body is not None: - self.send(message_body) + + # create a consistent interface to message_body + if hasattr(message_body, 'read'): + # Let file-like take precedence over byte-like. This + # is needed to allow the current position of mmap'ed + # files to be taken into account. + chunks = self._read_readable(message_body) + else: + try: + # this is solely to check to see if message_body + # implements the buffer API. it /would/ be easier + # to capture if PyObject_CheckBuffer was exposed + # to Python. + memoryview(message_body) + except TypeError: + try: + chunks = iter(message_body) + except TypeError: + raise TypeError("message_body should be a bytes-like " + "object or an iterable, got %r" + % type(message_body)) + else: + # the object implements the buffer interface and + # can be passed directly into socket methods + chunks = (message_body,) + + for chunk in chunks: + if not chunk: + if self.debuglevel > 0: + print('Zero length chunk ignored') + continue + + if encode_chunked and self._http_vsn == 11: + # chunked encoding + chunk = f'{len(chunk):X}\r\n'.encode('ascii') + chunk \ + + b'\r\n' + self.send(chunk) + + if encode_chunked and self._http_vsn == 11: + # end chunked transfer + self.send(b'0\r\n\r\n') def putrequest(self, method, url, skip_host=0, skip_accept_encoding=0): """Send a request to the server. @@ -1135,52 +1233,27 @@ class HTTPConnection: header = header + b': ' + value self._output(header) - def endheaders(self, message_body=None): + def endheaders(self, message_body=None, *, encode_chunked=False): """Indicate that the last header line has been sent to the server. This method sends the request to the server. The optional message_body argument can be used to pass a message body associated with the - request. The message body will be sent in the same packet as the - message headers if it is a string, otherwise it is sent as a separate - packet. + request. """ if self.__state == _CS_REQ_STARTED: self.__state = _CS_REQ_SENT else: raise CannotSendHeader() - self._send_output(message_body) + self._send_output(message_body, encode_chunked=encode_chunked) - def request(self, method, url, body=None, headers={}): + def request(self, method, url, body=None, headers={}, *, + encode_chunked=False): """Send a complete request to the server.""" - self._send_request(method, url, body, headers) - - def _set_content_length(self, body, method): - # Set the content-length based on the body. If the body is "empty", we - # set Content-Length: 0 for methods that expect a body (RFC 7230, - # Section 3.3.2). If the body is set for other methods, we set the - # header provided we can figure out what the length is. - thelen = None - method_expects_body = method.upper() in _METHODS_EXPECTING_BODY - if body is None and method_expects_body: - thelen = '0' - elif body is not None: - try: - thelen = str(len(body)) - except TypeError: - # If this is a file-like object, try to - # fstat its file descriptor - try: - thelen = str(os.fstat(body.fileno()).st_size) - except (AttributeError, OSError): - # Don't send a length if this failed - if self.debuglevel > 0: print("Cannot stat!!") + self._send_request(method, url, body, headers, encode_chunked) - if thelen is not None: - self.putheader('Content-Length', thelen) - - def _send_request(self, method, url, body, headers): + def _send_request(self, method, url, body, headers, encode_chunked): # Honor explicitly requested Host: and Accept-Encoding: headers. - header_names = dict.fromkeys([k.lower() for k in headers]) + header_names = frozenset(k.lower() for k in headers) skips = {} if 'host' in header_names: skips['skip_host'] = 1 @@ -1189,15 +1262,41 @@ class HTTPConnection: self.putrequest(method, url, **skips) + # chunked encoding will happen if HTTP/1.1 is used and either + # the caller passes encode_chunked=True or the following + # conditions hold: + # 1. content-length has not been explicitly set + # 2. the length of the body cannot be determined + # (e.g. it is a generator or unseekable file) + # 3. Transfer-Encoding has NOT been explicitly set by the caller + if 'content-length' not in header_names: - self._set_content_length(body, method) + # only chunk body if not explicitly set for backwards + # compatibility, assuming the client code is already handling the + # chunking + if 'transfer-encoding' not in header_names: + # if content-length cannot be automatically determined, fall + # back to chunked encoding + encode_chunked = False + content_length = self._get_content_length(body, method) + if content_length is None: + if body: + if self.debuglevel > 0: + print('Unable to determine size of %r' % body) + encode_chunked = True + self.putheader('Transfer-Encoding', 'chunked') + else: + self.putheader('Content-Length', str(content_length)) + else: + encode_chunked = False + for hdr, value in headers.items(): self.putheader(hdr, value) if isinstance(body, str): # RFC 2616 Section 3.7.1 says that text default has a # default charset of iso-8859-1. body = _encode(body, 'body') - self.endheaders(body) + self.endheaders(body, encode_chunked=encode_chunked) def getresponse(self): """Get the response from the server. diff --git a/Lib/test/test_httplib.py b/Lib/test/test_httplib.py index 1768a34..a179612 100644 --- a/Lib/test/test_httplib.py +++ b/Lib/test/test_httplib.py @@ -314,6 +314,124 @@ class HeaderTests(TestCase): conn.putheader(name, value) +class TransferEncodingTest(TestCase): + expected_body = b"It's just a flesh wound" + + def test_endheaders_chunked(self): + conn = client.HTTPConnection('example.com') + conn.sock = FakeSocket(b'') + conn.putrequest('POST', '/') + conn.endheaders(self._make_body(), encode_chunked=True) + + _, _, body = self._parse_request(conn.sock.data) + body = self._parse_chunked(body) + self.assertEqual(body, self.expected_body) + + def test_explicit_headers(self): + # explicit chunked + conn = client.HTTPConnection('example.com') + conn.sock = FakeSocket(b'') + # this shouldn't actually be automatically chunk-encoded because the + # calling code has explicitly stated that it's taking care of it + conn.request( + 'POST', '/', self._make_body(), {'Transfer-Encoding': 'chunked'}) + + _, headers, body = self._parse_request(conn.sock.data) + self.assertNotIn('content-length', [k.lower() for k in headers.keys()]) + self.assertEqual(headers['Transfer-Encoding'], 'chunked') + self.assertEqual(body, self.expected_body) + + # explicit chunked, string body + conn = client.HTTPConnection('example.com') + conn.sock = FakeSocket(b'') + conn.request( + 'POST', '/', self.expected_body.decode('latin-1'), + {'Transfer-Encoding': 'chunked'}) + + _, headers, body = self._parse_request(conn.sock.data) + self.assertNotIn('content-length', [k.lower() for k in headers.keys()]) + self.assertEqual(headers['Transfer-Encoding'], 'chunked') + self.assertEqual(body, self.expected_body) + + # User-specified TE, but request() does the chunk encoding + conn = client.HTTPConnection('example.com') + conn.sock = FakeSocket(b'') + conn.request('POST', '/', + headers={'Transfer-Encoding': 'gzip, chunked'}, + encode_chunked=True, + body=self._make_body()) + _, headers, body = self._parse_request(conn.sock.data) + self.assertNotIn('content-length', [k.lower() for k in headers]) + self.assertEqual(headers['Transfer-Encoding'], 'gzip, chunked') + self.assertEqual(self._parse_chunked(body), self.expected_body) + + def test_request(self): + for empty_lines in (False, True,): + conn = client.HTTPConnection('example.com') + conn.sock = FakeSocket(b'') + conn.request( + 'POST', '/', self._make_body(empty_lines=empty_lines)) + + _, headers, body = self._parse_request(conn.sock.data) + body = self._parse_chunked(body) + self.assertEqual(body, self.expected_body) + self.assertEqual(headers['Transfer-Encoding'], 'chunked') + + # Content-Length and Transfer-Encoding SHOULD not be sent in the + # same request + self.assertNotIn('content-length', [k.lower() for k in headers]) + + def _make_body(self, empty_lines=False): + lines = self.expected_body.split(b' ') + for idx, line in enumerate(lines): + # for testing handling empty lines + if empty_lines and idx % 2: + yield b'' + if idx < len(lines) - 1: + yield line + b' ' + else: + yield line + + def _parse_request(self, data): + lines = data.split(b'\r\n') + request = lines[0] + headers = {} + n = 1 + while n < len(lines) and len(lines[n]) > 0: + key, val = lines[n].split(b':') + key = key.decode('latin-1').strip() + headers[key] = val.decode('latin-1').strip() + n += 1 + + return request, headers, b'\r\n'.join(lines[n + 1:]) + + def _parse_chunked(self, data): + body = [] + trailers = {} + n = 0 + lines = data.split(b'\r\n') + # parse body + while True: + size, chunk = lines[n:n+2] + size = int(size, 16) + + if size == 0: + n += 1 + break + + self.assertEqual(size, len(chunk)) + body.append(chunk) + + n += 2 + # we /should/ hit the end chunk, but check against the size of + # lines so we're not stuck in an infinite loop should we get + # malformed data + if n > len(lines): + break + + return b''.join(body) + + class BasicTest(TestCase): def test_status_lines(self): # Test HTTP status lines @@ -564,11 +682,11 @@ class BasicTest(TestCase): yield None yield 'data_two' - class UpdatingFile(): + class UpdatingFile(io.TextIOBase): mode = 'r' d = data() def read(self, blocksize=-1): - return self.d.__next__() + return next(self.d) expected = b'data' @@ -1546,6 +1664,26 @@ class RequestBodyTest(TestCase): message = client.parse_headers(f) return message, f + def test_list_body(self): + # Note that no content-length is automatically calculated for + # an iterable. The request will fall back to send chunked + # transfer encoding. + cases = ( + ([b'foo', b'bar'], b'3\r\nfoo\r\n3\r\nbar\r\n0\r\n\r\n'), + ((b'foo', b'bar'), b'3\r\nfoo\r\n3\r\nbar\r\n0\r\n\r\n'), + ) + for body, expected in cases: + with self.subTest(body): + self.conn = client.HTTPConnection('example.com') + self.conn.sock = self.sock = FakeSocket('') + + self.conn.request('PUT', '/url', body) + msg, f = self.get_headers_and_fp() + self.assertNotIn('Content-Type', msg) + self.assertNotIn('Content-Length', msg) + self.assertEqual(msg.get('Transfer-Encoding'), 'chunked') + self.assertEqual(expected, f.read()) + def test_manual_content_length(self): # Set an incorrect content-length so that we can verify that # it will not be over-ridden by the library. @@ -1588,8 +1726,13 @@ class RequestBodyTest(TestCase): message, f = self.get_headers_and_fp() self.assertEqual("text/plain", message.get_content_type()) self.assertIsNone(message.get_charset()) - self.assertEqual("4", message.get("content-length")) - self.assertEqual(b'body', f.read()) + # Note that the length of text files is unpredictable + # because it depends on character encoding and line ending + # translation. No content-length will be set, the body + # will be sent using chunked transfer encoding. + self.assertIsNone(message.get("content-length")) + self.assertEqual("chunked", message.get("transfer-encoding")) + self.assertEqual(b'4\r\nbody\r\n0\r\n\r\n', f.read()) def test_binary_file_body(self): self.addCleanup(support.unlink, support.TESTFN) diff --git a/Lib/test/test_urllib2.py b/Lib/test/test_urllib2.py index eda7ccc..0eea0c7 100644 --- a/Lib/test/test_urllib2.py +++ b/Lib/test/test_urllib2.py @@ -7,6 +7,8 @@ import io import socket import array import sys +import tempfile +import subprocess import urllib.request # The proxy bypass method imported below has logic specific to the OSX @@ -335,7 +337,8 @@ class MockHTTPClass: else: self._tunnel_headers.clear() - def request(self, method, url, body=None, headers=None): + def request(self, method, url, body=None, headers=None, *, + encode_chunked=False): self.method = method self.selector = url if headers is not None: @@ -343,6 +346,7 @@ class MockHTTPClass: self.req_headers.sort() if body: self.data = body + self.encode_chunked = encode_chunked if self.raise_on_endheaders: raise OSError() @@ -908,41 +912,96 @@ class HandlerTests(unittest.TestCase): self.assertEqual(req.unredirected_hdrs["Host"], "baz") self.assertEqual(req.unredirected_hdrs["Spam"], "foo") - # Check iterable body support - def iterable_body(): - yield b"one" - yield b"two" - yield b"three" + def test_http_body_file(self): + # A regular file - Content Length is calculated unless already set. - for headers in {}, {"Content-Length": 11}: - req = Request("http://example.com/", iterable_body(), headers) - if not headers: - # Having an iterable body without a Content-Length should - # raise an exception - self.assertRaises(ValueError, h.do_request_, req) - else: + h = urllib.request.AbstractHTTPHandler() + o = h.parent = MockOpener() + + file_obj = tempfile.NamedTemporaryFile(mode='w+b', delete=False) + file_path = file_obj.name + file_obj.write(b"Something\nSomething\nSomething\n") + file_obj.close() + + for headers in {}, {"Content-Length": 30}: + with open(file_path, "rb") as f: + req = Request("http://example.com/", f, headers) newreq = h.do_request_(req) + self.assertEqual(int(newreq.get_header('Content-length')), 30) - # A file object. - # Test only Content-Length attribute of request. + os.unlink(file_path) + + def test_http_body_fileobj(self): + # A file object - Content Length is calculated unless already set. + # (Note that there are some subtle differences to a regular + # file, that is why we are testing both cases.) + + h = urllib.request.AbstractHTTPHandler() + o = h.parent = MockOpener() file_obj = io.BytesIO() file_obj.write(b"Something\nSomething\nSomething\n") for headers in {}, {"Content-Length": 30}: + file_obj.seek(0) req = Request("http://example.com/", file_obj, headers) - if not headers: - # Having an iterable body without a Content-Length should - # raise an exception - self.assertRaises(ValueError, h.do_request_, req) - else: - newreq = h.do_request_(req) - self.assertEqual(int(newreq.get_header('Content-length')), 30) + newreq = h.do_request_(req) + self.assertEqual(int(newreq.get_header('Content-length')), 30) file_obj.close() + def test_http_body_pipe(self): + # A file reading from a pipe. + # A pipe cannot be seek'ed. There is no way to determine the + # content length up front. Thus, do_request_() should fall + # back to Transfer-encoding chunked. + + h = urllib.request.AbstractHTTPHandler() + o = h.parent = MockOpener() + + cmd = [sys.executable, "-c", + r"import sys; " + r"sys.stdout.buffer.write(b'Something\nSomething\nSomething\n')"] + for headers in {}, {"Content-Length": 30}: + with subprocess.Popen(cmd, stdout=subprocess.PIPE) as proc: + req = Request("http://example.com/", proc.stdout, headers) + newreq = h.do_request_(req) + if not headers: + self.assertEqual(newreq.get_header('Content-length'), None) + self.assertEqual(newreq.get_header('Transfer-encoding'), + 'chunked') + else: + self.assertEqual(int(newreq.get_header('Content-length')), + 30) + + def test_http_body_iterable(self): + # Generic iterable. There is no way to determine the content + # length up front. Fall back to Transfer-encoding chunked. + + h = urllib.request.AbstractHTTPHandler() + o = h.parent = MockOpener() + + def iterable_body(): + yield b"one" + yield b"two" + yield b"three" + + for headers in {}, {"Content-Length": 11}: + req = Request("http://example.com/", iterable_body(), headers) + newreq = h.do_request_(req) + if not headers: + self.assertEqual(newreq.get_header('Content-length'), None) + self.assertEqual(newreq.get_header('Transfer-encoding'), + 'chunked') + else: + self.assertEqual(int(newreq.get_header('Content-length')), 11) + + def test_http_body_array(self): # array.array Iterable - Content Length is calculated + h = urllib.request.AbstractHTTPHandler() + o = h.parent = MockOpener() + iterable_array = array.array("I",[1,2,3,4]) for headers in {}, {"Content-Length": 16}: diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index dc436bc..30bf6e0 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -141,17 +141,9 @@ def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, *, cafile=None, capath=None, cadefault=False, context=None): '''Open the URL url, which can be either a string or a Request object. - *data* must be a bytes object specifying additional data to be sent to the - server, or None if no such data is needed. data may also be an iterable - object and in that case Content-Length value must be specified in the - headers. Currently HTTP requests are the only ones that use data; the HTTP - request will be a POST instead of a GET when the data parameter is - provided. - - *data* should be a buffer in the standard application/x-www-form-urlencoded - format. The urllib.parse.urlencode() function takes a mapping or sequence - of 2-tuples and returns an ASCII text string in this format. It should be - encoded to bytes before being used as the data parameter. + *data* must be an object specifying additional data to be sent to + the server, or None if no such data is needed. See Request for + details. urllib.request module uses HTTP/1.1 and includes a "Connection:close" header in its HTTP requests. @@ -1235,6 +1227,11 @@ class AbstractHTTPHandler(BaseHandler): def set_http_debuglevel(self, level): self._debuglevel = level + def _get_content_length(self, request): + return http.client.HTTPConnection._get_content_length( + request.data, + request.get_method()) + def do_request_(self, request): host = request.host if not host: @@ -1243,24 +1240,22 @@ class AbstractHTTPHandler(BaseHandler): if request.data is not None: # POST data = request.data if isinstance(data, str): - msg = "POST data should be bytes or an iterable of bytes. " \ - "It cannot be of type str." + msg = "POST data should be bytes, an iterable of bytes, " \ + "or a file object. It cannot be of type str." raise TypeError(msg) if not request.has_header('Content-type'): request.add_unredirected_header( 'Content-type', 'application/x-www-form-urlencoded') - if not request.has_header('Content-length'): - try: - mv = memoryview(data) - except TypeError: - if isinstance(data, collections.Iterable): - raise ValueError("Content-Length should be specified " - "for iterable data of type %r %r" % (type(data), - data)) + if (not request.has_header('Content-length') + and not request.has_header('Transfer-encoding')): + content_length = self._get_content_length(request) + if content_length is not None: + request.add_unredirected_header( + 'Content-length', str(content_length)) else: request.add_unredirected_header( - 'Content-length', '%d' % (len(mv) * mv.itemsize)) + 'Transfer-encoding', 'chunked') sel_host = host if request.has_proxy(): @@ -1316,7 +1311,8 @@ class AbstractHTTPHandler(BaseHandler): try: try: - h.request(req.get_method(), req.selector, req.data, headers) + h.request(req.get_method(), req.selector, req.data, headers, + encode_chunked=req.has_header('Transfer-encoding')) except OSError as err: # timeout error raise URLError(err) r = h.getresponse() diff --git a/Misc/ACKS b/Misc/ACKS index 9cc1c1e..46f4ae7 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -795,6 +795,7 @@ Daniel Kozan Jerzy Kozera Maksim Kozyarchuk Stefan Krah +Rolf Krahl Bob Kras Sebastian Kreft Holger Krekel diff --git a/Misc/NEWS b/Misc/NEWS index 5d98cdd..e0cd715 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -160,6 +160,14 @@ Library then further affects other traceback display operations in the module). Patch by Emanuel Barry. +- Issue #12319: Chunked transfer encoding support added to + http.client.HTTPConnection requests. The + urllib.request.AbstractHTTPHandler class does not enforce a Content-Length + header any more. If a HTTP request has a non-empty body, but no + Content-Length header, and the content length cannot be determined + up front, rather than throwing an error, the library now falls back + to use chunked transfer encoding. + - Issue #27664: Add to concurrent.futures.thread.ThreadPoolExecutor() the ability to specify a thread name prefix. -- cgit v0.12