diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2013-03-14 19:31:37 (GMT) |
---|---|---|
committer | Serhiy Storchaka <storchaka@gmail.com> | 2013-03-14 19:31:37 (GMT) |
commit | 8ea4616f16c8f89289e73849880499ef309ce18a (patch) | |
tree | 56df4eed0eda51947684a7b4ce894acb8741571c /Lib/urllib | |
parent | 3b220e11eb8e4768cbdef076d0ac133aaf4a2248 (diff) | |
download | cpython-8ea4616f16c8f89289e73849880499ef309ce18a.zip cpython-8ea4616f16c8f89289e73849880499ef309ce18a.tar.gz cpython-8ea4616f16c8f89289e73849880499ef309ce18a.tar.bz2 |
Issue #1285086: Get rid of the refcounting hack and speed up
urllib.parse.unquote() and urllib.parse.unquote_to_bytes().
Diffstat (limited to 'Lib/urllib')
-rw-r--r-- | Lib/urllib/parse.py | 63 |
1 files changed, 27 insertions, 36 deletions
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index dc75f8f..abe5d0d 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -27,6 +27,7 @@ parsing quirks from older RFCs are retained. The testcases in test_urlparse.py provides a good indicator of parsing behavior. """ +import re import sys import collections @@ -470,6 +471,10 @@ def urldefrag(url): defrag = url return _coerce_result(DefragResult(defrag, frag)) +_hexdig = '0123456789ABCDEFabcdef' +_hextobyte = {(a + b).encode(): bytes([int(a + b, 16)]) + for a in _hexdig for b in _hexdig} + def unquote_to_bytes(string): """unquote_to_bytes('abc%20def') -> b'abc def'.""" # Note: strings are encoded as UTF-8. This is only an issue if it contains @@ -480,16 +485,21 @@ def unquote_to_bytes(string): return b'' if isinstance(string, str): string = string.encode('utf-8') - res = string.split(b'%') - if len(res) == 1: + bits = string.split(b'%') + if len(bits) == 1: return string - string = res[0] - for item in res[1:]: + res = [bits[0]] + append = res.append + for item in bits[1:]: try: - string += bytes([int(item[:2], 16)]) + item[2:] - except ValueError: - string += b'%' + item - return string + append(_hextobyte[item[:2]]) + append(item[2:]) + except KeyError: + append(b'%') + append(item) + return b''.join(res) + +_asciire = re.compile('([\x00-\x7f]+)') def unquote(string, encoding='utf-8', errors='replace'): """Replace %xx escapes by their single-character equivalent. The optional @@ -501,39 +511,20 @@ def unquote(string, encoding='utf-8', errors='replace'): unquote('abc%20def') -> 'abc def'. """ - if string == '': - return string - res = string.split('%') - if len(res) == 1: + if '%' not in string: + string.split return string if encoding is None: encoding = 'utf-8' if errors is None: errors = 'replace' - # pct_sequence: contiguous sequence of percent-encoded bytes, decoded - pct_sequence = b'' - string = res[0] - for item in res[1:]: - try: - if not item: - raise ValueError - pct_sequence += bytes.fromhex(item[:2]) - rest = item[2:] - if not rest: - # This segment was just a single percent-encoded character. - # May be part of a sequence of code units, so delay decoding. - # (Stored in pct_sequence). - continue - except ValueError: - rest = '%' + item - # Encountered non-percent-encoded characters. Flush the current - # pct_sequence. - string += pct_sequence.decode(encoding, errors) + rest - pct_sequence = b'' - if pct_sequence: - # Flush the final pct_sequence - string += pct_sequence.decode(encoding, errors) - return string + bits = _asciire.split(string) + res = [bits[0]] + append = res.append + for i in range(1, len(bits), 2): + append(unquote_to_bytes(bits[i]).decode(encoding, errors)) + append(bits[i + 1]) + return ''.join(res) def parse_qs(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace'): |