From df204be922b4fecc0f7316248bb2ce585e816510 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Sat, 24 Nov 2012 17:59:08 +0100 Subject: =?UTF-8?q?Issue=20#16423:=20urllib.request=20now=20has=20support?= =?UTF-8?q?=20for=20``data:``=20URLs.=20Patch=20by=20Mathias=20Panzenb?= =?UTF-8?q?=C3=B6ck.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Doc/library/urllib.request.rst | 26 +++++++++++++-- Lib/test/test_urllib.py | 74 ++++++++++++++++++++++++++++++++++++++++++ Lib/urllib/request.py | 38 ++++++++++++++++++++-- Misc/ACKS | 1 + Misc/NEWS | 3 ++ 5 files changed, 137 insertions(+), 5 deletions(-) diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index 21255e5..84c387c 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -121,7 +121,7 @@ The :mod:`urllib.request` module defines the following functions: instances of them or subclasses of them: :class:`ProxyHandler`, :class:`UnknownHandler`, :class:`HTTPHandler`, :class:`HTTPDefaultErrorHandler`, :class:`HTTPRedirectHandler`, :class:`FTPHandler`, :class:`FileHandler`, - :class:`HTTPErrorProcessor`. + :class:`HTTPErrorProcessor`, :class:`DataHandler`. If the Python installation has SSL support (i.e., if the :mod:`ssl` module can be imported), :class:`HTTPSHandler` will also be added. @@ -346,6 +346,11 @@ The following classes are provided: Open local files. +.. class:: DataHandler() + + Open data URLs. + + .. versionadded:: 3.4 .. class:: FTPHandler() @@ -972,6 +977,21 @@ FileHandler Objects hostname is given, an :exc:`URLError` is raised. +.. _data-handler-objects: + +DataHandler Objects +------------------- + +.. method:: DataHandler.data_open(req) + + Read a data URL. This kind of URL contains the content encoded in the URL + itself. The data URL syntax is specified in :rfc:`2397`. This implementation + ignores white spaces in base64 encoded data URLs so the URL may be wrapped + in whatever source file it comes from. But even though some browsers don't + mind about a missing padding at the end of a base64 encoded data URL, this + implementation will raise an :exc:`ValueError` in that case. + + .. _ftp-handler-objects: FTPHandler Objects @@ -1374,7 +1394,9 @@ some point in the future. pair: FTP; protocol * Currently, only the following protocols are supported: HTTP (versions 0.9 and - 1.0), FTP, and local files. + 1.0), FTP, local files, and data URLs. + + .. versionchanged:: 3.4 Added support for data URLs. * The caching feature of :func:`urlretrieve` has been disabled until someone finds the time to hack proper processing of Expiration time headers. diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 52b9292..1e30fa6 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -337,6 +337,79 @@ Content-Type: text/html; charset=iso-8859-1 with support.check_warnings(('',DeprecationWarning)): urllib.request.URLopener() +class urlopen_DataTests(unittest.TestCase): + """Test urlopen() opening a data URL.""" + + def setUp(self): + # text containing URL special- and unicode-characters + self.text = "test data URLs :;,%=& \u00f6 \u00c4 " + # 2x1 pixel RGB PNG image with one black and one white pixel + self.image = ( + b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x02\x00\x00\x00' + b'\x01\x08\x02\x00\x00\x00{@\xe8\xdd\x00\x00\x00\x01sRGB\x00\xae' + b'\xce\x1c\xe9\x00\x00\x00\x0fIDAT\x08\xd7c```\xf8\xff\xff?\x00' + b'\x06\x01\x02\xfe\no/\x1e\x00\x00\x00\x00IEND\xaeB`\x82') + + self.text_url = ( + "data:text/plain;charset=UTF-8,test%20data%20URLs%20%3A%3B%2C%25%3" + "D%26%20%C3%B6%20%C3%84%20") + self.text_url_base64 = ( + "data:text/plain;charset=ISO-8859-1;base64,dGVzdCBkYXRhIFVSTHMgOjs" + "sJT0mIPYgxCA%3D") + # base64 encoded data URL that contains ignorable spaces, + # such as "\n", " ", "%0A", and "%20". + self.image_url = ( + "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAABCAIAAAB7\n" + "QOjdAAAAAXNSR0IArs4c6QAAAA9JREFUCNdj%0AYGBg%2BP//PwAGAQL%2BCm8 " + "vHgAAAABJRU5ErkJggg%3D%3D%0A%20") + + self.text_url_resp = urllib.request.urlopen(self.text_url) + self.text_url_base64_resp = urllib.request.urlopen( + self.text_url_base64) + self.image_url_resp = urllib.request.urlopen(self.image_url) + + def test_interface(self): + # Make sure object returned by urlopen() has the specified methods + for attr in ("read", "readline", "readlines", + "close", "info", "geturl", "getcode", "__iter__"): + self.assertTrue(hasattr(self.text_url_resp, attr), + "object returned by urlopen() lacks %s attribute" % + attr) + + def test_info(self): + self.assertIsInstance(self.text_url_resp.info(), email.message.Message) + self.assertEqual(self.text_url_base64_resp.info().get_params(), + [('text/plain', ''), ('charset', 'ISO-8859-1')]) + self.assertEqual(self.image_url_resp.info()['content-length'], + str(len(self.image))) + self.assertEqual(urllib.request.urlopen("data:,").info().get_params(), + [('text/plain', ''), ('charset', 'US-ASCII')]) + + def test_geturl(self): + self.assertEqual(self.text_url_resp.geturl(), self.text_url) + self.assertEqual(self.text_url_base64_resp.geturl(), + self.text_url_base64) + self.assertEqual(self.image_url_resp.geturl(), self.image_url) + + def test_read_text(self): + self.assertEqual(self.text_url_resp.read().decode( + dict(self.text_url_resp.info().get_params())['charset']), self.text) + + def test_read_text_base64(self): + self.assertEqual(self.text_url_base64_resp.read().decode( + dict(self.text_url_base64_resp.info().get_params())['charset']), + self.text) + + def test_read_image(self): + self.assertEqual(self.image_url_resp.read(), self.image) + + def test_missing_comma(self): + self.assertRaises(ValueError,urllib.request.urlopen,'data:text/plain') + + def test_invalid_base64_data(self): + # missing padding character + self.assertRaises(ValueError,urllib.request.urlopen,'data:;base64,Cg=') + class urlretrieve_FileTests(unittest.TestCase): """Test urllib.urlretrieve() on local files""" @@ -1313,6 +1386,7 @@ def test_main(): support.run_unittest( urlopen_FileTests, urlopen_HttpTests, + urlopen_DataTests, urlretrieve_FileTests, urlretrieve_HttpTests, ProxyTests, diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index d389fa9..74725f9 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -103,7 +103,8 @@ from urllib.error import URLError, HTTPError, ContentTooShortError from urllib.parse import ( urlparse, urlsplit, urljoin, unwrap, quote, unquote, splittype, splithost, splitport, splituser, splitpasswd, - splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse) + splitattr, splitquery, splitvalue, splittag, to_bytes, + unquote_to_bytes, urlunparse) from urllib.response import addinfourl, addclosehook # check for SSL @@ -121,7 +122,7 @@ __all__ = [ 'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm', 'AbstractBasicAuthHandler', 'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler', 'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', - 'HTTPHandler', 'FileHandler', 'FTPHandler', 'CacheFTPHandler', + 'HTTPHandler', 'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler', 'UnknownHandler', 'HTTPErrorProcessor', # Functions 'urlopen', 'install_opener', 'build_opener', @@ -535,7 +536,8 @@ def build_opener(*handlers): opener = OpenerDirector() default_classes = [ProxyHandler, UnknownHandler, HTTPHandler, HTTPDefaultErrorHandler, HTTPRedirectHandler, - FTPHandler, FileHandler, HTTPErrorProcessor] + FTPHandler, FileHandler, HTTPErrorProcessor, + DataHandler] if hasattr(http.client, "HTTPSConnection"): default_classes.append(HTTPSHandler) skip = set() @@ -1541,6 +1543,36 @@ class CacheFTPHandler(FTPHandler): self.cache.clear() self.timeout.clear() +class DataHandler(BaseHandler): + def data_open(self, req): + # data URLs as specified in RFC 2397. + # + # ignores POSTed data + # + # syntax: + # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data + # mediatype := [ type "/" subtype ] *( ";" parameter ) + # data := *urlchar + # parameter := attribute "=" value + url = req.full_url + + scheme, data = url.split(":",1) + mediatype, data = data.split(",",1) + + # even base64 encoded data URLs might be quoted so unquote in any case: + data = unquote_to_bytes(data) + if mediatype.endswith(";base64"): + data = base64.decodebytes(data) + mediatype = mediatype[:-7] + + if not mediatype: + mediatype = "text/plain;charset=US-ASCII" + + headers = email.message_from_string("Content-type: %s\nContent-length: %d\n" % + (mediatype, len(data))) + + return addinfourl(io.BytesIO(data), headers, url) + # Code move from the old urllib module diff --git a/Misc/ACKS b/Misc/ACKS index 45ef0c7..0b29321 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -884,6 +884,7 @@ Mike Pall Todd R. Palmer Juan David Ibáñez Palomar Jan Palus +Mathias Panzenböck M. Papillon Peter Parente Alexandre Parenteau diff --git a/Misc/NEWS b/Misc/NEWS index 8fe37dc..619a851 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -138,6 +138,9 @@ Core and Builtins Library ------- +- Issue #16423: urllib.request now has support for ``data:`` URLs. Patch by + Mathias Panzenböck. + - Issue #4473: Add a POP3.stls() to switch a clear-text POP3 session into an encrypted POP3 session, on supported servers. Patch by Lorenzo Catucci. -- cgit v0.12