diff options
author | Gregory P. Smith <greg@krypto.org> | 2021-05-12 00:01:44 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-05-12 00:01:44 (GMT) |
commit | d597fdc5fd0e8aa73a783fea27287db669950c15 (patch) | |
tree | 2279592f52acf02a00a412edb72ff625efac232f /Lib | |
parent | e9d7f88d5643f7e6387bf994c130503766d7eb92 (diff) | |
download | cpython-d597fdc5fd0e8aa73a783fea27287db669950c15.zip cpython-d597fdc5fd0e8aa73a783fea27287db669950c15.tar.gz cpython-d597fdc5fd0e8aa73a783fea27287db669950c15.tar.bz2 |
bpo-44002: Switch to lru_cache in urllib.parse. (GH-25798)
Switch to lru_cache in urllib.parse.
urllib.parse now uses functool.lru_cache for its internal URL splitting and
quoting caches instead of rolling its own like its the 90s.
The undocumented internal Quoted class API is now deprecated
as it had no reason to be public and no existing OSS users were found.
The clear_cache() API remains undocumented but gets an explicit test as it
is used in a few projects' (twisted, gevent) tests as well as our own regrtest.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/test/test_urlparse.py | 18 | ||||
-rw-r--r-- | Lib/urllib/parse.py | 58 |
2 files changed, 45 insertions, 31 deletions
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 31943f3..dff9a8e 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -1044,16 +1044,24 @@ class UrlParseTestCase(unittest.TestCase): self.assertEqual(p1.params, 'phone-context=+1-914-555') def test_Quoter_repr(self): - quoter = urllib.parse.Quoter(urllib.parse._ALWAYS_SAFE) + quoter = urllib.parse._Quoter(urllib.parse._ALWAYS_SAFE) self.assertIn('Quoter', repr(quoter)) + def test_clear_cache_for_code_coverage(self): + urllib.parse.clear_cache() + + def test_urllib_parse_getattr_failure(self): + """Test that urllib.parse.__getattr__() fails correctly.""" + with self.assertRaises(AttributeError): + unused = urllib.parse.this_does_not_exist + def test_all(self): expected = [] undocumented = { 'splitattr', 'splithost', 'splitnport', 'splitpasswd', 'splitport', 'splitquery', 'splittag', 'splittype', 'splituser', 'splitvalue', - 'Quoter', 'ResultBase', 'clear_cache', 'to_bytes', 'unwrap', + 'ResultBase', 'clear_cache', 'to_bytes', 'unwrap', } for name in dir(urllib.parse): if name.startswith('_') or name in undocumented: @@ -1245,6 +1253,12 @@ class Utility_Tests(unittest.TestCase): class DeprecationTest(unittest.TestCase): + def test_Quoter_deprecation(self): + with self.assertWarns(DeprecationWarning) as cm: + old_class = urllib.parse.Quoter + self.assertIs(old_class, urllib.parse._Quoter) + self.assertIn('Quoter will be removed', str(cm.warning)) + def test_splittype_deprecation(self): with self.assertWarns(DeprecationWarning) as cm: urllib.parse.splittype('') diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index b35997b..bf16d0f 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -27,10 +27,11 @@ parsing quirks from older RFCs are retained. The testcases in test_urlparse.py provides a good indicator of parsing behavior. """ +from collections import namedtuple +import functools import re import sys import types -import collections import warnings __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", @@ -81,15 +82,10 @@ scheme_chars = ('abcdefghijklmnopqrstuvwxyz' # Unsafe bytes to be removed per WHATWG spec _UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n'] -# XXX: Consider replacing with functools.lru_cache -MAX_CACHE_SIZE = 20 -_parse_cache = {} - def clear_cache(): - """Clear the parse cache and the quoters cache.""" - _parse_cache.clear() - _safe_quoters.clear() - + """Clear internal performance caches. Undocumented; some tests want it.""" + urlsplit.cache_clear() + _byte_quoter_factory.cache_clear() # Helpers for bytes handling # For 3.2, we deliberately require applications that @@ -243,8 +239,6 @@ class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes): return hostname, port -from collections import namedtuple - _DefragResultBase = namedtuple('DefragResult', 'url fragment') _SplitResultBase = namedtuple( 'SplitResult', 'scheme netloc path query fragment') @@ -434,6 +428,9 @@ def _checknetloc(netloc): raise ValueError("netloc '" + netloc + "' contains invalid " + "characters under NFKC normalization") +# typed=True avoids BytesWarnings being emitted during cache key +# comparison since this API supports both bytes and str input. +@functools.lru_cache(typed=True) def urlsplit(url, scheme='', allow_fragments=True): """Parse a URL into 5 components: <scheme>://<netloc>/<path>?<query>#<fragment> @@ -462,12 +459,6 @@ def urlsplit(url, scheme='', allow_fragments=True): scheme = scheme.replace(b, "") allow_fragments = bool(allow_fragments) - key = url, scheme, allow_fragments, type(url), type(scheme) - cached = _parse_cache.get(key, None) - if cached: - return _coerce_result(cached) - if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth - clear_cache() netloc = query = fragment = '' i = url.find(':') if i > 0: @@ -488,7 +479,6 @@ def urlsplit(url, scheme='', allow_fragments=True): url, query = url.split('?', 1) _checknetloc(netloc) v = SplitResult(scheme, netloc, url, query, fragment) - _parse_cache[key] = v return _coerce_result(v) def urlunparse(components): @@ -791,23 +781,30 @@ _ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' b'0123456789' b'_.-~') _ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE) -_safe_quoters = {} -class Quoter(collections.defaultdict): - """A mapping from bytes (in range(0,256)) to strings. +def __getattr__(name): + if name == 'Quoter': + warnings.warn('Deprecated in 3.11. ' + 'urllib.parse.Quoter will be removed in Python 3.14. ' + 'It was not intended to be a public API.', + DeprecationWarning, stacklevel=2) + return _Quoter + raise AttributeError(f'module {__name__!r} has no attribute {name!r}') + +class _Quoter(dict): + """A mapping from bytes numbers (in range(0,256)) to strings. String values are percent-encoded byte values, unless the key < 128, and - in the "safe" set (either the specified safe set, or default set). + in either of the specified safe set, or the always safe set. """ - # Keeps a cache internally, using defaultdict, for efficiency (lookups + # Keeps a cache internally, via __missing__, for efficiency (lookups # of cached keys don't call Python code at all). def __init__(self, safe): """safe: bytes object.""" self.safe = _ALWAYS_SAFE.union(safe) def __repr__(self): - # Without this, will just display as a defaultdict - return "<%s %r>" % (self.__class__.__name__, dict(self)) + return f"<Quoter {dict(self)!r}>" def __missing__(self, b): # Handle a cache miss. Store quoted string in cache and return. @@ -886,6 +883,11 @@ def quote_plus(string, safe='', encoding=None, errors=None): string = quote(string, safe + space, encoding, errors) return string.replace(' ', '+') +# Expectation: A typical program is unlikely to create more than 5 of these. +@functools.lru_cache +def _byte_quoter_factory(safe): + return _Quoter(safe).__getitem__ + def quote_from_bytes(bs, safe='/'): """Like quote(), but accepts a bytes object rather than a str, and does not perform string-to-bytes encoding. It always returns an ASCII string. @@ -899,13 +901,11 @@ def quote_from_bytes(bs, safe='/'): # Normalize 'safe' by converting to bytes and removing non-ASCII chars safe = safe.encode('ascii', 'ignore') else: + # List comprehensions are faster than generator expressions. safe = bytes([c for c in safe if c < 128]) if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe): return bs.decode() - try: - quoter = _safe_quoters[safe] - except KeyError: - _safe_quoters[safe] = quoter = Quoter(safe).__getitem__ + quoter = _byte_quoter_factory(safe) return ''.join([quoter(char) for char in bs]) def urlencode(query, doseq=False, safe='', encoding=None, errors=None, |