From 37ddbb8abdef2406612455c4083cf6ee82926875 Mon Sep 17 00:00:00 2001 From: Florent Xicluna Date: Sat, 14 Aug 2010 21:06:29 +0000 Subject: Merged revisions 76719,81270-81272,83294,83319,84038-84039 via svnmerge from svn+ssh://pythondev@svn.python.org/python/branches/py3k MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ................ r76719 | antoine.pitrou | 2009-12-08 20:38:17 +0100 (mar., 08 déc. 2009) | 9 lines Merged revisions 76718 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r76718 | antoine.pitrou | 2009-12-08 20:35:12 +0100 (mar., 08 déc. 2009) | 3 lines Fix transient refleaks in test_urllib. Thanks to Florent Xicluna. ........ ................ r81270 | florent.xicluna | 2010-05-17 19:24:07 +0200 (lun., 17 mai 2010) | 9 lines Merged revision 81259 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81259 | florent.xicluna | 2010-05-17 12:39:07 +0200 (lun, 17 mai 2010) | 2 lines Slight style cleanup. ........ ................ r81271 | florent.xicluna | 2010-05-17 19:33:07 +0200 (lun., 17 mai 2010) | 11 lines Issue #1285086: Speed up urllib.parse functions: quote, quote_from_bytes, unquote, unquote_to_bytes. Recorded merge of revisions 81265 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r81265 | florent.xicluna | 2010-05-17 15:35:09 +0200 (lun, 17 mai 2010) | 2 lines Issue #1285086: Speed up urllib.quote and urllib.unquote for simple cases. ........ ................ r81272 | florent.xicluna | 2010-05-17 20:01:22 +0200 (lun., 17 mai 2010) | 2 lines Inadvertently removed part of the comment in r81271. ................ r83294 | senthil.kumaran | 2010-07-30 21:34:36 +0200 (ven., 30 juil. 2010) | 2 lines Fix issue9301 - handle unquote({}) kind of case. ................ r83319 | florent.xicluna | 2010-07-31 10:56:55 +0200 (sam., 31 juil. 2010) | 2 lines Fix an oversight in r83294. unquote() should reject bytes. Issue #9301. ................ r84038 | florent.xicluna | 2010-08-14 20:30:35 +0200 (sam., 14 août 2010) | 1 line Silence the BytesWarning, due to patch r83294 for #9301 ................ r84039 | florent.xicluna | 2010-08-14 22:51:58 +0200 (sam., 14 août 2010) | 1 line Silence BytesWarning while testing exception ................ --- Lib/test/test_urllib.py | 16 +++++--- Lib/urllib/parse.py | 98 ++++++++++++++++++++++++++++--------------------- Lib/urllib/request.py | 2 +- Misc/NEWS | 3 ++ 4 files changed, 72 insertions(+), 47 deletions(-) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 80cd8ef..775d810 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -261,8 +261,8 @@ class urlretrieve_FileTests(unittest.TestCase): result = urllib.request.urlretrieve("file:%s" % support.TESTFN) self.assertEqual(result[0], support.TESTFN) self.assertTrue(isinstance(result[1], email.message.Message), - "did not get a email.message.Message instance as second " - "returned value") + "did not get a email.message.Message instance " + "as second returned value") def test_copy(self): # Test that setting the filename argument works. @@ -539,6 +539,7 @@ class QuotingTests(unittest.TestCase): self.assertEqual(expect, result, "using quote_plus(): %r != %r" % (expect, result)) + class UnquotingTests(unittest.TestCase): """Tests for unquote() and unquote_plus() @@ -566,6 +567,10 @@ class UnquotingTests(unittest.TestCase): self.assertEqual(result.count('%'), 1, "using unquote(): not all characters escaped: " "%s" % result) + self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, None) + self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, ()) + with support.check_warnings(): + self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, b'') def test_unquoting_badpercent(self): # Test unquoting on bad percent-escapes @@ -600,6 +605,8 @@ class UnquotingTests(unittest.TestCase): result = urllib.parse.unquote_to_bytes(given) self.assertEqual(expect, result, "using unquote_to_bytes(): %r != %r" % (expect, result)) + self.assertRaises((TypeError, AttributeError), urllib.parse.unquote_to_bytes, None) + self.assertRaises((TypeError, AttributeError), urllib.parse.unquote_to_bytes, ()) def test_unquoting_mixed_case(self): # Test unquoting on mixed-case hex digits in the percent-escapes @@ -741,7 +748,7 @@ class urlencode_Tests(unittest.TestCase): expect_somewhere = ["1st=1", "2nd=2", "3rd=3"] result = urllib.parse.urlencode(given) for expected in expect_somewhere: - self.assertTrue(expected in result, + self.assertIn(expected, result, "testing %s: %s not found in %s" % (test_type, expected, result)) self.assertEqual(result.count('&'), 2, @@ -788,8 +795,7 @@ class urlencode_Tests(unittest.TestCase): result = urllib.parse.urlencode(given, True) for value in given["sequence"]: expect = "sequence=%s" % value - self.assertTrue(expect in result, - "%s not found in %s" % (expect, result)) + self.assertIn(expect, result) self.assertEqual(result.count('&'), 2, "Expected 2 '&'s, got %s" % result.count('&')) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 886c51c..765f1c8 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -39,7 +39,7 @@ uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap', uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file', 'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '', - 'svn', 'svn+ssh', 'sftp', 'nfs',' git', 'git+ssh'] + 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh'] non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap', 'snews', 'sip', 'sips'] uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap', @@ -61,8 +61,9 @@ MAX_CACHE_SIZE = 20 _parse_cache = {} def clear_cache(): - """Clear the parse cache.""" + """Clear the parse cache and the quoters cache.""" _parse_cache.clear() + _safe_quoters.clear() class ResultMixin(object): @@ -302,17 +303,22 @@ def unquote_to_bytes(string): """unquote_to_bytes('abc%20def') -> b'abc def'.""" # Note: strings are encoded as UTF-8. This is only an issue if it contains # unescaped non-ASCII characters, which URIs should not. + if not string: + # Is it a string-like object? + string.split + return b'' if isinstance(string, str): string = string.encode('utf-8') res = string.split(b'%') - res[0] = res[0] - for i in range(1, len(res)): - item = res[i] + if len(res) == 1: + return string + string = res[0] + for item in res[1:]: try: - res[i] = bytes([int(item[:2], 16)]) + item[2:] + string += bytes([int(item[:2], 16)]) + item[2:] except ValueError: - res[i] = b'%' + item - return b''.join(res) + string += b'%' + item + return string def unquote(string, encoding='utf-8', errors='replace'): """Replace %xx escapes by their single-character equivalent. The optional @@ -324,36 +330,39 @@ def unquote(string, encoding='utf-8', errors='replace'): unquote('abc%20def') -> 'abc def'. """ - if encoding is None: encoding = 'utf-8' - if errors is None: errors = 'replace' - # pct_sequence: contiguous sequence of percent-encoded bytes, decoded - # (list of single-byte bytes objects) - pct_sequence = [] + if string == '': + return string res = string.split('%') - for i in range(1, len(res)): - item = res[i] + if len(res) == 1: + return string + if encoding is None: + encoding = 'utf-8' + if errors is None: + errors = 'replace' + # pct_sequence: contiguous sequence of percent-encoded bytes, decoded + pct_sequence = b'' + string = res[0] + for item in res[1:]: try: - if not item: raise ValueError - pct_sequence.append(bytes.fromhex(item[:2])) + if not item: + raise ValueError + pct_sequence += bytes.fromhex(item[:2]) rest = item[2:] + if not rest: + # This segment was just a single percent-encoded character. + # May be part of a sequence of code units, so delay decoding. + # (Stored in pct_sequence). + continue except ValueError: rest = '%' + item - if not rest: - # This segment was just a single percent-encoded character. - # May be part of a sequence of code units, so delay decoding. - # (Stored in pct_sequence). - res[i] = '' - else: - # Encountered non-percent-encoded characters. Flush the current - # pct_sequence. - res[i] = b''.join(pct_sequence).decode(encoding, errors) + rest - pct_sequence = [] + # Encountered non-percent-encoded characters. Flush the current + # pct_sequence. + string += pct_sequence.decode(encoding, errors) + rest + pct_sequence = b'' if pct_sequence: # Flush the final pct_sequence - # res[-1] will always be empty if pct_sequence != [] - assert not res[-1], "string=%r, res=%r" % (string, res) - res[-1] = b''.join(pct_sequence).decode(encoding, errors) - return ''.join(res) + string += pct_sequence.decode(encoding, errors) + return string def parse_qs(qs, keep_blank_values=False, strict_parsing=False): """Parse a query given as a string argument. @@ -434,7 +443,8 @@ _ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' b'abcdefghijklmnopqrstuvwxyz' b'0123456789' b'_.-') -_safe_quoters= {} +_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE) +_safe_quoters = {} class Quoter(collections.defaultdict): """A mapping from bytes (in range(0,256)) to strings. @@ -446,7 +456,7 @@ class Quoter(collections.defaultdict): # of cached keys don't call Python code at all). def __init__(self, safe): """safe: bytes object.""" - self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128) + self.safe = _ALWAYS_SAFE.union(safe) def __repr__(self): # Without this, will just display as a defaultdict @@ -454,7 +464,7 @@ class Quoter(collections.defaultdict): def __missing__(self, b): # Handle a cache miss. Store quoted string in cache and return. - res = b in self.safe and chr(b) or ('%%%02X' % b) + res = chr(b) if b in self.safe else '%{:02X}'.format(b) self[b] = res return res @@ -488,6 +498,8 @@ def quote(string, safe='/', encoding=None, errors=None): errors='strict' (unsupported characters raise a UnicodeEncodeError). """ if isinstance(string, str): + if not string: + return string if encoding is None: encoding = 'utf-8' if errors is None: @@ -522,18 +534,22 @@ def quote_from_bytes(bs, safe='/'): not perform string-to-bytes encoding. It always returns an ASCII string. quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB' """ + if not isinstance(bs, (bytes, bytearray)): + raise TypeError("quote_from_bytes() expected bytes") + if not bs: + return '' if isinstance(safe, str): # Normalize 'safe' by converting to bytes and removing non-ASCII chars safe = safe.encode('ascii', 'ignore') - cachekey = bytes(safe) # In case it was a bytearray - if not (isinstance(bs, bytes) or isinstance(bs, bytearray)): - raise TypeError("quote_from_bytes() expected a bytes") + else: + safe = bytes([c for c in safe if c < 128]) + if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe): + return bs.decode() try: - quoter = _safe_quoters[cachekey] + quoter = _safe_quoters[safe] except KeyError: - quoter = Quoter(safe) - _safe_quoters[cachekey] = quoter - return ''.join([quoter[char] for char in bs]) + _safe_quoters[safe] = quoter = Quoter(safe).__getitem__ + return ''.join([quoter(char) for char in bs]) def urlencode(query, doseq=False, safe='', encoding=None, errors=None): """Encode a sequence of two-element tuples or dictionary into a URL query string. diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index 3ea38f2..0e62d9f 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -1988,7 +1988,7 @@ class FancyURLopener(URLopener): else: return self.open(newurl, data) - def get_user_passwd(self, host, realm, clear_cache = 0): + def get_user_passwd(self, host, realm, clear_cache=0): key = realm + '@' + host.lower() if key in self.auth_cache: if clear_cache: diff --git a/Misc/NEWS b/Misc/NEWS index 478517b..3432993 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -291,6 +291,9 @@ Library compilation in a non-ASCII directory if stdout encoding is ASCII (eg. if stdout is not a TTY). +- Issue #1285086: Speed up urllib.parse functions: quote, quote_from_bytes, + unquote, unquote_to_bytes. + - Issue #8688: Distutils now recalculates MANIFEST everytime. - Issue #5099: subprocess.Popen.__del__ no longer references global objects -- cgit v0.12