From 37ddbb8abdef2406612455c4083cf6ee82926875 Mon Sep 17 00:00:00 2001
From: Florent Xicluna <florent.xicluna@gmail.com>
Date: Sat, 14 Aug 2010 21:06:29 +0000
Subject: Merged revisions 76719,81270-81272,83294,83319,84038-84039 via
 svnmerge from svn+ssh://pythondev@svn.python.org/python/branches/py3k
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

................
  r76719 | antoine.pitrou | 2009-12-08 20:38:17 +0100 (mar., 08 déc. 2009) | 9 lines

  Merged revisions 76718 via svnmerge from
  svn+ssh://pythondev@svn.python.org/python/trunk

  ........
    r76718 | antoine.pitrou | 2009-12-08 20:35:12 +0100 (mar., 08 déc. 2009) | 3 lines

    Fix transient refleaks in test_urllib. Thanks to Florent Xicluna.
  ........
................
  r81270 | florent.xicluna | 2010-05-17 19:24:07 +0200 (lun., 17 mai 2010) | 9 lines

  Merged revision 81259 via svnmerge from
  svn+ssh://pythondev@svn.python.org/python/trunk

  ........
    r81259 | florent.xicluna | 2010-05-17 12:39:07 +0200 (lun, 17 mai 2010) | 2 lines

    Slight style cleanup.
  ........
................
  r81271 | florent.xicluna | 2010-05-17 19:33:07 +0200 (lun., 17 mai 2010) | 11 lines

  Issue #1285086: Speed up urllib.parse functions: quote, quote_from_bytes, unquote, unquote_to_bytes.

  Recorded merge of revisions 81265 via svnmerge from
  svn+ssh://pythondev@svn.python.org/python/trunk

  ........
    r81265 | florent.xicluna | 2010-05-17 15:35:09 +0200 (lun, 17 mai 2010) | 2 lines

    Issue #1285086: Speed up urllib.quote and urllib.unquote for simple cases.
  ........
................
  r81272 | florent.xicluna | 2010-05-17 20:01:22 +0200 (lun., 17 mai 2010) | 2 lines

  Restore part of the comment that was inadvertently removed in r81271.
................
  r83294 | senthil.kumaran | 2010-07-30 21:34:36 +0200 (ven., 30 juil. 2010) | 2 lines

  Fix issue #9301: handle cases such as unquote({}).
................
  r83319 | florent.xicluna | 2010-07-31 10:56:55 +0200 (sam., 31 juil. 2010) | 2 lines

  Fix an oversight in r83294.  unquote() should reject bytes.  Issue #9301.
................
  r84038 | florent.xicluna | 2010-08-14 20:30:35 +0200 (sam., 14 août 2010) | 1 line

  Silence the BytesWarning, due to patch r83294 for #9301
................
  r84039 | florent.xicluna | 2010-08-14 22:51:58 +0200 (sam., 14 août 2010) | 1 line

  Silence BytesWarning while testing exception
................
---
 Lib/test/test_urllib.py | 16 +++++---
 Lib/urllib/parse.py     | 98 ++++++++++++++++++++++++++++---------------------
 Lib/urllib/request.py   |  2 +-
 Misc/NEWS               |  3 ++
 4 files changed, 72 insertions(+), 47 deletions(-)

diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py
index 80cd8ef..775d810 100644
--- a/Lib/test/test_urllib.py
+++ b/Lib/test/test_urllib.py
@@ -261,8 +261,8 @@ class urlretrieve_FileTests(unittest.TestCase):
         result = urllib.request.urlretrieve("file:%s" % support.TESTFN)
         self.assertEqual(result[0], support.TESTFN)
         self.assertTrue(isinstance(result[1], email.message.Message),
-                     "did not get a email.message.Message instance as second "
-                     "returned value")
+                     "did not get a email.message.Message instance "
+                     "as second returned value")
 
     def test_copy(self):
         # Test that setting the filename argument works.
@@ -539,6 +539,7 @@ class QuotingTests(unittest.TestCase):
         self.assertEqual(expect, result,
                          "using quote_plus(): %r != %r" % (expect, result))
 
+
 class UnquotingTests(unittest.TestCase):
     """Tests for unquote() and unquote_plus()
 
@@ -566,6 +567,10 @@ class UnquotingTests(unittest.TestCase):
         self.assertEqual(result.count('%'), 1,
                          "using unquote(): not all characters escaped: "
                          "%s" % result)
+        self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, None)
+        self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, ())
+        with support.check_warnings():
+            self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, b'')
 
     def test_unquoting_badpercent(self):
         # Test unquoting on bad percent-escapes
@@ -600,6 +605,8 @@ class UnquotingTests(unittest.TestCase):
         result = urllib.parse.unquote_to_bytes(given)
         self.assertEqual(expect, result, "using unquote_to_bytes(): %r != %r"
                          % (expect, result))
+        self.assertRaises((TypeError, AttributeError), urllib.parse.unquote_to_bytes, None)
+        self.assertRaises((TypeError, AttributeError), urllib.parse.unquote_to_bytes, ())
 
     def test_unquoting_mixed_case(self):
         # Test unquoting on mixed-case hex digits in the percent-escapes
@@ -741,7 +748,7 @@ class urlencode_Tests(unittest.TestCase):
         expect_somewhere = ["1st=1", "2nd=2", "3rd=3"]
         result = urllib.parse.urlencode(given)
         for expected in expect_somewhere:
-            self.assertTrue(expected in result,
+            self.assertIn(expected, result,
                          "testing %s: %s not found in %s" %
                          (test_type, expected, result))
         self.assertEqual(result.count('&'), 2,
@@ -788,8 +795,7 @@ class urlencode_Tests(unittest.TestCase):
         result = urllib.parse.urlencode(given, True)
         for value in given["sequence"]:
             expect = "sequence=%s" % value
-            self.assertTrue(expect in result,
-                         "%s not found in %s" % (expect, result))
+            self.assertIn(expect, result)
         self.assertEqual(result.count('&'), 2,
                          "Expected 2 '&'s, got %s" % result.count('&'))
 
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 886c51c..765f1c8 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -39,7 +39,7 @@ uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
                'imap', 'wais', 'file', 'mms', 'https', 'shttp',
                'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
-               'svn', 'svn+ssh', 'sftp', 'nfs',' git', 'git+ssh']
+               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                     'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
 uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
@@ -61,8 +61,9 @@ MAX_CACHE_SIZE = 20
 _parse_cache = {}
 
 def clear_cache():
-    """Clear the parse cache."""
+    """Clear the parse cache and the quoters cache."""
     _parse_cache.clear()
+    _safe_quoters.clear()
 
 
 class ResultMixin(object):
@@ -302,17 +303,22 @@ def unquote_to_bytes(string):
     """unquote_to_bytes('abc%20def') -> b'abc def'."""
     # Note: strings are encoded as UTF-8. This is only an issue if it contains
     # unescaped non-ASCII characters, which URIs should not.
+    if not string:
+        # Is it a string-like object?
+        string.split
+        return b''
     if isinstance(string, str):
         string = string.encode('utf-8')
     res = string.split(b'%')
-    res[0] = res[0]
-    for i in range(1, len(res)):
-        item = res[i]
+    if len(res) == 1:
+        return string
+    string = res[0]
+    for item in res[1:]:
         try:
-            res[i] = bytes([int(item[:2], 16)]) + item[2:]
+            string += bytes([int(item[:2], 16)]) + item[2:]
         except ValueError:
-            res[i] = b'%' + item
-    return b''.join(res)
+            string += b'%' + item
+    return string
 
 def unquote(string, encoding='utf-8', errors='replace'):
     """Replace %xx escapes by their single-character equivalent. The optional
@@ -324,36 +330,39 @@ def unquote(string, encoding='utf-8', errors='replace'):
 
     unquote('abc%20def') -> 'abc def'.
     """
-    if encoding is None: encoding = 'utf-8'
-    if errors is None: errors = 'replace'
-    # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
-    # (list of single-byte bytes objects)
-    pct_sequence = []
+    if string == '':
+        return string
     res = string.split('%')
-    for i in range(1, len(res)):
-        item = res[i]
+    if len(res) == 1:
+        return string
+    if encoding is None:
+        encoding = 'utf-8'
+    if errors is None:
+        errors = 'replace'
+    # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
+    pct_sequence = b''
+    string = res[0]
+    for item in res[1:]:
         try:
-            if not item: raise ValueError
-            pct_sequence.append(bytes.fromhex(item[:2]))
+            if not item:
+                raise ValueError
+            pct_sequence += bytes.fromhex(item[:2])
             rest = item[2:]
+            if not rest:
+                # This segment was just a single percent-encoded character.
+                # May be part of a sequence of code units, so delay decoding.
+                # (Stored in pct_sequence).
+                continue
         except ValueError:
             rest = '%' + item
-        if not rest:
-            # This segment was just a single percent-encoded character.
-            # May be part of a sequence of code units, so delay decoding.
-            # (Stored in pct_sequence).
-            res[i] = ''
-        else:
-            # Encountered non-percent-encoded characters. Flush the current
-            # pct_sequence.
-            res[i] = b''.join(pct_sequence).decode(encoding, errors) + rest
-            pct_sequence = []
+        # Encountered non-percent-encoded characters. Flush the current
+        # pct_sequence.
+        string += pct_sequence.decode(encoding, errors) + rest
+        pct_sequence = b''
     if pct_sequence:
         # Flush the final pct_sequence
-        # res[-1] will always be empty if pct_sequence != []
-        assert not res[-1], "string=%r, res=%r" % (string, res)
-        res[-1] = b''.join(pct_sequence).decode(encoding, errors)
-    return ''.join(res)
+        string += pct_sequence.decode(encoding, errors)
+    return string
 
 def parse_qs(qs, keep_blank_values=False, strict_parsing=False):
     """Parse a query given as a string argument.
@@ -434,7 +443,8 @@ _ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                          b'abcdefghijklmnopqrstuvwxyz'
                          b'0123456789'
                          b'_.-')
-_safe_quoters= {}
+_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
+_safe_quoters = {}
 
 class Quoter(collections.defaultdict):
     """A mapping from bytes (in range(0,256)) to strings.
@@ -446,7 +456,7 @@ class Quoter(collections.defaultdict):
     # of cached keys don't call Python code at all).
     def __init__(self, safe):
         """safe: bytes object."""
-        self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128)
+        self.safe = _ALWAYS_SAFE.union(safe)
 
     def __repr__(self):
         # Without this, will just display as a defaultdict
@@ -454,7 +464,7 @@ class Quoter(collections.defaultdict):
 
     def __missing__(self, b):
         # Handle a cache miss. Store quoted string in cache and return.
-        res = b in self.safe and chr(b) or ('%%%02X' % b)
+        res = chr(b) if b in self.safe else '%{:02X}'.format(b)
         self[b] = res
         return res
 
@@ -488,6 +498,8 @@ def quote(string, safe='/', encoding=None, errors=None):
     errors='strict' (unsupported characters raise a UnicodeEncodeError).
     """
     if isinstance(string, str):
+        if not string:
+            return string
         if encoding is None:
             encoding = 'utf-8'
         if errors is None:
@@ -522,18 +534,22 @@ def quote_from_bytes(bs, safe='/'):
     not perform string-to-bytes encoding.  It always returns an ASCII string.
     quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
     """
+    if not isinstance(bs, (bytes, bytearray)):
+        raise TypeError("quote_from_bytes() expected bytes")
+    if not bs:
+        return ''
     if isinstance(safe, str):
         # Normalize 'safe' by converting to bytes and removing non-ASCII chars
         safe = safe.encode('ascii', 'ignore')
-    cachekey = bytes(safe)  # In case it was a bytearray
-    if not (isinstance(bs, bytes) or isinstance(bs, bytearray)):
-        raise TypeError("quote_from_bytes() expected a bytes")
+    else:
+        safe = bytes([c for c in safe if c < 128])
+    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
+        return bs.decode()
     try:
-        quoter = _safe_quoters[cachekey]
+        quoter = _safe_quoters[safe]
     except KeyError:
-        quoter = Quoter(safe)
-        _safe_quoters[cachekey] = quoter
-    return ''.join([quoter[char] for char in bs])
+        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
+    return ''.join([quoter(char) for char in bs])
 
 def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
     """Encode a sequence of two-element tuples or dictionary into a URL query string.
diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py
index 3ea38f2..0e62d9f 100644
--- a/Lib/urllib/request.py
+++ b/Lib/urllib/request.py
@@ -1988,7 +1988,7 @@ class FancyURLopener(URLopener):
         else:
             return self.open(newurl, data)
 
-    def get_user_passwd(self, host, realm, clear_cache = 0):
+    def get_user_passwd(self, host, realm, clear_cache=0):
         key = realm + '@' + host.lower()
         if key in self.auth_cache:
             if clear_cache:
diff --git a/Misc/NEWS b/Misc/NEWS
index 478517b..3432993 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -291,6 +291,9 @@ Library
   compilation in a non-ASCII directory if stdout encoding is ASCII (eg. if
   stdout is not a TTY).
 
+- Issue #1285086: Speed up urllib.parse functions: quote, quote_from_bytes,
+  unquote, unquote_to_bytes.
+
 - Issue #8688: Distutils now recalculates MANIFEST everytime.
 
 - Issue #5099: subprocess.Popen.__del__ no longer references global objects
-- 
cgit v0.12