From 923baea9f921e829ece677e32c45a1a91acb3bef Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 14 Mar 2013 21:31:09 +0200 Subject: Issue #1285086: Get rid of the refcounting hack and speed up urllib.unquote(). --- Lib/urllib.py | 32 +++++++++++++++++++++++--------- Lib/urlparse.py | 42 +++++++++++++++++++++++++++++++++--------- Misc/NEWS | 2 ++ 3 files changed, 58 insertions(+), 18 deletions(-) diff --git a/Lib/urllib.py b/Lib/urllib.py index 33641a5..f9655f9 100644 --- a/Lib/urllib.py +++ b/Lib/urllib.py @@ -28,6 +28,7 @@ import os import time import sys import base64 +import re from urlparse import urljoin as basejoin @@ -1198,22 +1199,35 @@ def splitvalue(attr): _hexdig = '0123456789ABCDEFabcdef' _hextochr = dict((a + b, chr(int(a + b, 16))) for a in _hexdig for b in _hexdig) +_asciire = re.compile('([\x00-\x7f]+)') def unquote(s): """unquote('abc%20def') -> 'abc def'.""" - res = s.split('%') + if _is_unicode(s): + if '%' not in s: + return s + bits = _asciire.split(s) + res = [bits[0]] + append = res.append + for i in range(1, len(bits), 2): + append(unquote(str(bits[i])).decode('latin1')) + append(bits[i + 1]) + return ''.join(res) + + bits = s.split('%') # fastpath - if len(res) == 1: + if len(bits) == 1: return s - s = res[0] - for item in res[1:]: + res = [bits[0]] + append = res.append + for item in bits[1:]: try: - s += _hextochr[item[:2]] + item[2:] + append(_hextochr[item[:2]]) + append(item[2:]) except KeyError: - s += '%' + item - except UnicodeDecodeError: - s += unichr(int(item[:2], 16)) + item[2:] - return s + append('%') + append(item) + return ''.join(res) def unquote_plus(s): """unquote('%7e/abc+def') -> '~/abc def'""" diff --git a/Lib/urlparse.py b/Lib/urlparse.py index f370ce3..4ce982e 100644 --- a/Lib/urlparse.py +++ b/Lib/urlparse.py @@ -28,6 +28,8 @@ test_urlparse.py provides a good indicator of parsing behavior. """ +import re + __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"] @@ -311,6 +313,15 @@ def urldefrag(url): else: return url, '' +try: + unicode +except NameError: + def _is_unicode(x): + return 0 +else: + def _is_unicode(x): + return isinstance(x, unicode) + # unquote method for parse_qs and parse_qsl # Cannot use directly from urllib as it would create a circular reference # because urllib uses urlparse methods (urljoin). If you update this function, @@ -319,22 +330,35 @@ def urldefrag(url): _hexdig = '0123456789ABCDEFabcdef' _hextochr = dict((a+b, chr(int(a+b,16))) for a in _hexdig for b in _hexdig) +_asciire = re.compile('([\x00-\x7f]+)') def unquote(s): """unquote('abc%20def') -> 'abc def'.""" - res = s.split('%') + if _is_unicode(s): + if '%' not in s: + return s + bits = _asciire.split(s) + res = [bits[0]] + append = res.append + for i in range(1, len(bits), 2): + append(unquote(str(bits[i])).decode('latin1')) + append(bits[i + 1]) + return ''.join(res) + + bits = s.split('%') # fastpath - if len(res) == 1: + if len(bits) == 1: return s - s = res[0] - for item in res[1:]: + res = [bits[0]] + append = res.append + for item in bits[1:]: try: - s += _hextochr[item[:2]] + item[2:] + append(_hextochr[item[:2]]) + append(item[2:]) except KeyError: - s += '%' + item - except UnicodeDecodeError: - s += unichr(int(item[:2], 16)) + item[2:] - return s + append('%') + append(item) + return ''.join(res) def parse_qs(qs, keep_blank_values=0, strict_parsing=0): """Parse a query given as a string argument. diff --git a/Misc/NEWS b/Misc/NEWS index ae0402b..10c3bd0 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -214,6 +214,8 @@ Core and Builtins Library ------- +- Issue #1285086: Get rid of the refcounting hack and speed up urllib.unquote(). + - Issue #17368: Fix an off-by-one error in the Python JSON decoder that caused a failure while decoding empty object literals when object_pairs_hook was specified. -- cgit v0.12