-rw-r--r--   Lib/test/test_urlparse.py                                          | 18
-rw-r--r--   Lib/urllib/parse.py                                                | 58
-rw-r--r--   Misc/NEWS.d/next/Library/2021-05-01-15-43-37.bpo-44002.KLT_wd.rst  |  5
3 files changed, 50 insertions, 31 deletions
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py
index 31943f3..dff9a8e 100644
--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@@ -1044,16 +1044,24 @@ class UrlParseTestCase(unittest.TestCase):
         self.assertEqual(p1.params, 'phone-context=+1-914-555')
 
     def test_Quoter_repr(self):
-        quoter = urllib.parse.Quoter(urllib.parse._ALWAYS_SAFE)
+        quoter = urllib.parse._Quoter(urllib.parse._ALWAYS_SAFE)
         self.assertIn('Quoter', repr(quoter))
 
+    def test_clear_cache_for_code_coverage(self):
+        urllib.parse.clear_cache()
+
+    def test_urllib_parse_getattr_failure(self):
+        """Test that urllib.parse.__getattr__() fails correctly."""
+        with self.assertRaises(AttributeError):
+            unused = urllib.parse.this_does_not_exist
+
     def test_all(self):
         expected = []
         undocumented = {
             'splitattr', 'splithost', 'splitnport', 'splitpasswd',
             'splitport', 'splitquery', 'splittag', 'splittype', 'splituser',
             'splitvalue',
-            'Quoter', 'ResultBase', 'clear_cache', 'to_bytes', 'unwrap',
+            'ResultBase', 'clear_cache', 'to_bytes', 'unwrap',
         }
         for name in dir(urllib.parse):
             if name.startswith('_') or name in undocumented:
@@ -1245,6 +1253,12 @@ class Utility_Tests(unittest.TestCase):
 
 class DeprecationTest(unittest.TestCase):
 
+    def test_Quoter_deprecation(self):
+        with self.assertWarns(DeprecationWarning) as cm:
+            old_class = urllib.parse.Quoter
+        self.assertIs(old_class, urllib.parse._Quoter)
+        self.assertIn('Quoter will be removed', str(cm.warning))
+
     def test_splittype_deprecation(self):
         with self.assertWarns(DeprecationWarning) as cm:
             urllib.parse.splittype('')
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index b35997b..bf16d0f 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -27,10 +27,11 @@ parsing quirks from older RFCs are retained.
 The testcases in test_urlparse.py provides a good indicator of parsing behavior.
 """
 
+from collections import namedtuple
+import functools
 import re
 import sys
 import types
-import collections
 import warnings
 
 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
@@ -81,15 +82,10 @@ scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
 # Unsafe bytes to be removed per WHATWG spec
 _UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']
 
-# XXX: Consider replacing with functools.lru_cache
-MAX_CACHE_SIZE = 20
-_parse_cache = {}
-
 def clear_cache():
-    """Clear the parse cache and the quoters cache."""
-    _parse_cache.clear()
-    _safe_quoters.clear()
-
+    """Clear internal performance caches. Undocumented; some tests want it."""
+    urlsplit.cache_clear()
+    _byte_quoter_factory.cache_clear()
 
 # Helpers for bytes handling
 # For 3.2, we deliberately require applications that
@@ -243,8 +239,6 @@ class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
         return hostname, port
 
 
-from collections import namedtuple
-
 _DefragResultBase = namedtuple('DefragResult', 'url fragment')
 _SplitResultBase = namedtuple(
     'SplitResult', 'scheme netloc path query fragment')
@@ -434,6 +428,9 @@ def _checknetloc(netloc):
             raise ValueError("netloc '" + netloc + "' contains invalid " +
                              "characters under NFKC normalization")
 
+# typed=True avoids BytesWarnings being emitted during cache key
+# comparison since this API supports both bytes and str input.
+@functools.lru_cache(typed=True)
 def urlsplit(url, scheme='', allow_fragments=True):
     """Parse a URL into 5 components:
     <scheme>://<netloc>/<path>?<query>#<fragment>
@@ -462,12 +459,6 @@ def urlsplit(url, scheme='', allow_fragments=True):
         scheme = scheme.replace(b, "")
 
     allow_fragments = bool(allow_fragments)
-    key = url, scheme, allow_fragments, type(url), type(scheme)
-    cached = _parse_cache.get(key, None)
-    if cached:
-        return _coerce_result(cached)
-    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
-        clear_cache()
     netloc = query = fragment = ''
     i = url.find(':')
     if i > 0:
@@ -488,7 +479,6 @@ def urlsplit(url, scheme='', allow_fragments=True):
         url, query = url.split('?', 1)
     _checknetloc(netloc)
     v = SplitResult(scheme, netloc, url, query, fragment)
-    _parse_cache[key] = v
     return _coerce_result(v)
 
 def urlunparse(components):
@@ -791,23 +781,30 @@ _ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                          b'0123456789'
                          b'_.-~')
 _ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
-_safe_quoters = {}
 
-class Quoter(collections.defaultdict):
-    """A mapping from bytes (in range(0,256)) to strings.
+def __getattr__(name):
+    if name == 'Quoter':
+        warnings.warn('Deprecated in 3.11. '
+                      'urllib.parse.Quoter will be removed in Python 3.14. '
+                      'It was not intended to be a public API.',
+                      DeprecationWarning, stacklevel=2)
+        return _Quoter
+    raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
+
+class _Quoter(dict):
+    """A mapping from bytes numbers (in range(0,256)) to strings.
 
     String values are percent-encoded byte values, unless the key < 128, and
-    in the "safe" set (either the specified safe set, or default set).
+    in either of the specified safe set, or the always safe set.
     """
-    # Keeps a cache internally, using defaultdict, for efficiency (lookups
+    # Keeps a cache internally, via __missing__, for efficiency (lookups
     # of cached keys don't call Python code at all).
     def __init__(self, safe):
         """safe: bytes object."""
         self.safe = _ALWAYS_SAFE.union(safe)
 
     def __repr__(self):
-        # Without this, will just display as a defaultdict
-        return "<%s %r>" % (self.__class__.__name__, dict(self))
+        return f"<Quoter {dict(self)!r}>"
 
     def __missing__(self, b):
         # Handle a cache miss. Store quoted string in cache and return.
@@ -886,6 +883,11 @@ def quote_plus(string, safe='', encoding=None, errors=None):
     string = quote(string, safe + space, encoding, errors)
     return string.replace(' ', '+')
 
+# Expectation: A typical program is unlikely to create more than 5 of these.
+@functools.lru_cache
+def _byte_quoter_factory(safe):
+    return _Quoter(safe).__getitem__
+
 def quote_from_bytes(bs, safe='/'):
     """Like quote(), but accepts a bytes object rather than a str, and does
     not perform string-to-bytes encoding.  It always returns an ASCII string.
@@ -899,13 +901,11 @@ def quote_from_bytes(bs, safe='/'):
         # Normalize 'safe' by converting to bytes and removing non-ASCII chars
         safe = safe.encode('ascii', 'ignore')
     else:
+        # List comprehensions are faster than generator expressions.
         safe = bytes([c for c in safe if c < 128])
     if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
         return bs.decode()
-    try:
-        quoter = _safe_quoters[safe]
-    except KeyError:
-        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
+    quoter = _byte_quoter_factory(safe)
    return ''.join([quoter(char) for char in bs])
 
 def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
diff --git a/Misc/NEWS.d/next/Library/2021-05-01-15-43-37.bpo-44002.KLT_wd.rst b/Misc/NEWS.d/next/Library/2021-05-01-15-43-37.bpo-44002.KLT_wd.rst
new file mode 100644
index 0000000..9d662d9
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2021-05-01-15-43-37.bpo-44002.KLT_wd.rst
@@ -0,0 +1,5 @@
+:mod:`urllib.parse` now uses :func:`functools.lru_cache` for its internal URL
+splitting and quoting caches instead of rolling its own like it's the '90s.
+
+The undocumented internal :mod:`urllib.parse` ``Quoter`` class API is now
+deprecated, for removal in 3.14.
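For reference, the user-visible behaviour introduced by this patch can be exercised from ordinary code. The snippet below is a minimal sketch assuming a Python build (3.11 or later) that contains this change; the example URL and variable names are illustrative only.

import warnings
import urllib.parse

# urlsplit() is now wrapped in functools.lru_cache (typed=True, so str and
# bytes inputs are cached under distinct keys); re-parsing the same str URL
# returns the identical cached SplitResult object.
a = urllib.parse.urlsplit('https://example.com/path?q=1')
b = urllib.parse.urlsplit('https://example.com/path?q=1')
assert a is b

# clear_cache() now simply delegates to the lru_cache cache_clear() methods.
urllib.parse.clear_cache()
c = urllib.parse.urlsplit('https://example.com/path?q=1')
assert c == a and c is not a  # equal value, but a freshly parsed object

# Accessing the removed public name Quoter goes through the new module-level
# __getattr__, which emits a DeprecationWarning and hands back _Quoter.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    quoter_class = urllib.parse.Quoter
assert quoter_class is urllib.parse._Quoter
assert issubclass(caught[0].category, DeprecationWarning)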