From 66cdb2bd8ab93a4691bead7f5d1e54e5ca6895b4 Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Thu, 10 Apr 2025 20:58:04 +0100 Subject: GH-123599: `url2pathname()`: handle authority section in file URL (#126844) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In `urllib.request.url2pathname()`, if the authority resolves to the current host, discard it. If an authority is present but resolves somewhere else, then on Windows we return a UNC path (as before), and on other platforms we raise `URLError`. Affects `pathlib.Path.from_uri()` in the same way. Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com> Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> --- Doc/library/pathlib.rst | 6 +++ Doc/library/urllib.request.rst | 17 ++++--- Doc/whatsnew/3.14.rst | 19 ++++++++ Lib/pathlib/__init__.py | 6 ++- Lib/test/test_pathlib/test_pathlib.py | 6 ++- Lib/test/test_urllib.py | 43 ++++++++++++++---- Lib/urllib/request.py | 52 +++++++++------------- .../2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst | 5 +++ 8 files changed, 106 insertions(+), 48 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index 708a16e..b829869 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -871,6 +871,12 @@ conforming to :rfc:`8089`. .. versionadded:: 3.13 + .. versionchanged:: next + If a URL authority (e.g. a hostname) is present and resolves to a local + address, it is discarded. If an authority is present and *doesn't* + resolve to a local address, then on Windows a UNC path is returned (as + before), and on other platforms a :exc:`ValueError` is raised. + .. method:: Path.as_uri() diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index 8b54e10..edfc249 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -159,15 +159,15 @@ The :mod:`urllib.request` module defines the following functions: 'file:///C:/Program%20Files' .. versionchanged:: 3.14 - Paths beginning with a slash are converted to URLs with authority - sections. For example, the path ``/etc/hosts`` is converted to - the URL ``///etc/hosts``. - - .. versionchanged:: 3.14 Windows drive letters are no longer converted to uppercase, and ``:`` characters not following a drive letter no longer cause an :exc:`OSError` exception to be raised on Windows. + .. versionchanged:: 3.14 + Paths beginning with a slash are converted to URLs with authority + sections. For example, the path ``/etc/hosts`` is converted to + the URL ``///etc/hosts``. + .. function:: url2pathname(url) @@ -186,6 +186,13 @@ The :mod:`urllib.request` module defines the following functions: characters not following a drive letter no longer cause an :exc:`OSError` exception to be raised on Windows. + .. versionchanged:: next + This function calls :func:`socket.gethostbyname` if the URL authority + isn't empty or ``localhost``. If the authority resolves to a local IP + address then it is discarded; otherwise, on Windows a UNC path is + returned (as before), and on other platforms a + :exc:`~urllib.error.URLError` is raised. + .. function:: getproxies() diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 0f15a2a..f8cae78 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -1197,6 +1197,25 @@ urllib supporting SHA-256 digest authentication as specified in :rfc:`7616`. (Contributed by Calvin Bui in :gh:`128193`.) +* Improve standards compliance when parsing and emitting ``file:`` URLs. + + In :func:`urllib.request.url2pathname`: + + - Discard URL authorities that resolve to a local IP address. + - Raise :exc:`~urllib.error.URLError` if a URL authority doesn't resolve + to ``localhost``, except on Windows where we return a UNC path. + + In :func:`urllib.request.pathname2url`: + + - Include an empty URL authority when a path begins with a slash. For + example, the path ``/etc/hosts`` is converted to the URL ``///etc/hosts``. + + On Windows, drive letters are no longer converted to uppercase, and ``:`` + characters not following a drive letter no longer cause an :exc:`OSError` + exception to be raised. + + (Contributed by Barney Gale in :gh:`125866`.) + uuid ---- diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index cd28f62..43a5440 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -1278,8 +1278,12 @@ class Path(PurePath): """Return a new path from the given 'file' URI.""" if not uri.startswith('file:'): raise ValueError(f"URI does not start with 'file:': {uri!r}") + from urllib.error import URLError from urllib.request import url2pathname - path = cls(url2pathname(uri.removeprefix('file:'))) + try: + path = cls(url2pathname(uri.removeprefix('file:'))) + except URLError as exc: + raise ValueError(exc.reason) from None if not path.is_absolute(): raise ValueError(f"URI is not absolute: {uri!r}") return path diff --git a/Lib/test/test_pathlib/test_pathlib.py b/Lib/test/test_pathlib/test_pathlib.py index b1fcc5f..00ec17e 100644 --- a/Lib/test/test_pathlib/test_pathlib.py +++ b/Lib/test/test_pathlib/test_pathlib.py @@ -3285,10 +3285,14 @@ class PathTest(PurePathTest): def test_from_uri_posix(self): P = self.cls self.assertEqual(P.from_uri('file:/foo/bar'), P('/foo/bar')) - self.assertEqual(P.from_uri('file://foo/bar'), P('//foo/bar')) + self.assertRaises(ValueError, P.from_uri, 'file://foo/bar') self.assertEqual(P.from_uri('file:///foo/bar'), P('/foo/bar')) self.assertEqual(P.from_uri('file:////foo/bar'), P('//foo/bar')) self.assertEqual(P.from_uri('file://localhost/foo/bar'), P('/foo/bar')) + if not is_wasi: + self.assertEqual(P.from_uri('file://127.0.0.1/foo/bar'), P('/foo/bar')) + self.assertEqual(P.from_uri(f'file://{socket.gethostname()}/foo/bar'), + P('/foo/bar')) self.assertRaises(ValueError, P.from_uri, 'foo/bar') self.assertRaises(ValueError, P.from_uri, '/foo/bar') self.assertRaises(ValueError, P.from_uri, '//foo/bar') diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index ed23215..ecf429e 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -11,6 +11,7 @@ from test import support from test.support import os_helper from test.support import socket_helper import os +import socket try: import ssl except ImportError: @@ -1424,6 +1425,17 @@ class Pathname_Tests(unittest.TestCase): "url2pathname() failed; %s != %s" % (expect, result)) + def test_pathname2url(self): + # Test cases common to Windows and POSIX. + fn = urllib.request.pathname2url + sep = os.path.sep + self.assertEqual(fn(''), '') + self.assertEqual(fn(sep), '///') + self.assertEqual(fn('a'), 'a') + self.assertEqual(fn(f'a{sep}b.c'), 'a/b.c') + self.assertEqual(fn(f'{sep}a{sep}b.c'), '///a/b.c') + self.assertEqual(fn(f'{sep}a{sep}b%#c'), '///a/b%25%23c') + @unittest.skipUnless(sys.platform == 'win32', 'test specific to Windows pathnames.') def test_pathname2url_win(self): @@ -1466,12 +1478,9 @@ class Pathname_Tests(unittest.TestCase): 'test specific to POSIX pathnames') def test_pathname2url_posix(self): fn = urllib.request.pathname2url - self.assertEqual(fn('/'), '///') - self.assertEqual(fn('/a/b.c'), '///a/b.c') self.assertEqual(fn('//a/b.c'), '////a/b.c') self.assertEqual(fn('///a/b.c'), '/////a/b.c') self.assertEqual(fn('////a/b.c'), '//////a/b.c') - self.assertEqual(fn('/a/b%#c'), '///a/b%25%23c') @unittest.skipUnless(os_helper.FS_NONASCII, 'need os_helper.FS_NONASCII') def test_pathname2url_nonascii(self): @@ -1480,11 +1489,25 @@ class Pathname_Tests(unittest.TestCase): url = urllib.parse.quote(os_helper.FS_NONASCII, encoding=encoding, errors=errors) self.assertEqual(urllib.request.pathname2url(os_helper.FS_NONASCII), url) + def test_url2pathname(self): + # Test cases common to Windows and POSIX. + fn = urllib.request.url2pathname + sep = os.path.sep + self.assertEqual(fn(''), '') + self.assertEqual(fn('/'), f'{sep}') + self.assertEqual(fn('///'), f'{sep}') + self.assertEqual(fn('////'), f'{sep}{sep}') + self.assertEqual(fn('foo'), 'foo') + self.assertEqual(fn('foo/bar'), f'foo{sep}bar') + self.assertEqual(fn('/foo/bar'), f'{sep}foo{sep}bar') + self.assertEqual(fn('//localhost/foo/bar'), f'{sep}foo{sep}bar') + self.assertEqual(fn('///foo/bar'), f'{sep}foo{sep}bar') + self.assertEqual(fn('////foo/bar'), f'{sep}{sep}foo{sep}bar') + @unittest.skipUnless(sys.platform == 'win32', 'test specific to Windows pathnames.') def test_url2pathname_win(self): fn = urllib.request.url2pathname - self.assertEqual(fn('/'), '\\') self.assertEqual(fn('/C:/'), 'C:\\') self.assertEqual(fn("///C|"), 'C:') self.assertEqual(fn("///C:"), 'C:') @@ -1530,11 +1553,13 @@ class Pathname_Tests(unittest.TestCase): 'test specific to POSIX pathnames') def test_url2pathname_posix(self): fn = urllib.request.url2pathname - self.assertEqual(fn('/foo/bar'), '/foo/bar') - self.assertEqual(fn('//foo/bar'), '//foo/bar') - self.assertEqual(fn('///foo/bar'), '/foo/bar') - self.assertEqual(fn('////foo/bar'), '//foo/bar') - self.assertEqual(fn('//localhost/foo/bar'), '/foo/bar') + self.assertRaises(urllib.error.URLError, fn, '//foo/bar') + self.assertRaises(urllib.error.URLError, fn, '//localhost:/foo/bar') + self.assertRaises(urllib.error.URLError, fn, '//:80/foo/bar') + self.assertRaises(urllib.error.URLError, fn, '//:/foo/bar') + self.assertRaises(urllib.error.URLError, fn, '//c:80/foo/bar') + self.assertEqual(fn('//127.0.0.1/foo/bar'), '/foo/bar') + self.assertEqual(fn(f'//{socket.gethostname()}/foo/bar'), '/foo/bar') @unittest.skipUnless(os_helper.FS_NONASCII, 'need os_helper.FS_NONASCII') def test_url2pathname_nonascii(self): diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index f22dc56..84c075e 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -1450,16 +1450,6 @@ def parse_http_list(s): return [part.strip() for part in res] class FileHandler(BaseHandler): - # Use local file or FTP depending on form of URL - def file_open(self, req): - url = req.selector - if url[:2] == '//' and url[2:3] != '/' and (req.host and - req.host != 'localhost'): - if not req.host in self.get_names(): - raise URLError("file:// scheme is supported only on localhost") - else: - return self.open_local_file(req) - # names for the localhost names = None def get_names(self): @@ -1476,8 +1466,7 @@ class FileHandler(BaseHandler): def open_local_file(self, req): import email.utils import mimetypes - host = req.host - filename = req.selector + filename = _splittype(req.full_url)[1] localfile = url2pathname(filename) try: stats = os.stat(localfile) @@ -1487,21 +1476,21 @@ class FileHandler(BaseHandler): headers = email.message_from_string( 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % (mtype or 'text/plain', size, modified)) - if host: - host, port = _splitport(host) - if not host or \ - (not port and _safe_gethostbyname(host) in self.get_names()): - origurl = 'file:' + pathname2url(localfile) - return addinfourl(open(localfile, 'rb'), headers, origurl) + origurl = f'file:{pathname2url(localfile)}' + return addinfourl(open(localfile, 'rb'), headers, origurl) except OSError as exp: raise URLError(exp, exp.filename) - raise URLError('file not on local host') -def _safe_gethostbyname(host): + file_open = open_local_file + +def _is_local_authority(authority): + if not authority or authority == 'localhost': + return True try: - return socket.gethostbyname(host) - except socket.gaierror: - return None + address = socket.gethostbyname(authority) + except (socket.gaierror, AttributeError): + return False + return address in FileHandler().get_names() class FTPHandler(BaseHandler): def ftp_open(self, req): @@ -1649,16 +1638,13 @@ class DataHandler(BaseHandler): def url2pathname(url): """OS-specific conversion from a relative URL of the 'file' scheme to a file system path; not recommended for general use.""" - if url[:3] == '///': - # Empty authority section, so the path begins on the third character. - url = url[2:] - elif url[:12] == '//localhost/': - # Skip past 'localhost' authority. - url = url[11:] - + authority, url = _splithost(url) if os.name == 'nt': - if url[:3] == '///': - # Skip past extra slash before UNC drive in URL path. + if not _is_local_authority(authority): + # e.g. file://server/share/file.txt + url = '//' + authority + url + elif url[:3] == '///': + # e.g. file://///server/share/file.txt url = url[1:] else: if url[:1] == '/' and url[2:3] in (':', '|'): @@ -1668,6 +1654,8 @@ def url2pathname(url): # Older URLs use a pipe after a drive letter url = url[:1] + ':' + url[2:] url = url.replace('/', '\\') + elif not _is_local_authority(authority): + raise URLError("file:// scheme is supported only on localhost") encoding = sys.getfilesystemencoding() errors = sys.getfilesystemencodeerrors() return unquote(url, encoding=encoding, errors=errors) diff --git a/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst b/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst new file mode 100644 index 0000000..857cc35 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst @@ -0,0 +1,5 @@ +Fix issue where :func:`urllib.request.url2pathname` mishandled file URLs with +authorities. If an authority is present and resolves to ``localhost``, it is +now discarded. If an authority is present but *doesn't* resolve to +``localhost``, then on Windows a UNC path is returned (as before), and on +other platforms a :exc:`urllib.error.URLError` is now raised. -- cgit v0.12