summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Lib/test/test_urllib.py2
-rw-r--r--Lib/urllib/parse.py129
2 files changed, 70 insertions, 61 deletions
diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py
index d4630a8..f5a9d5d 100644
--- a/Lib/test/test_urllib.py
+++ b/Lib/test/test_urllib.py
@@ -465,7 +465,7 @@ class UnquotingTests(unittest.TestCase):
def test_unquote_with_unicode(self):
r = urllib.parse.unquote('br%C3%BCckner_sapporo_20050930.doc')
- self.assertEqual(r, 'br\u00FCckner_sapporo_20050930.doc')
+ self.assertEqual(r, 'br\xc3\xbcckner_sapporo_20050930.doc')
class urlencode_Tests(unittest.TestCase):
"""Tests for urlencode()"""
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index f924a3a..fe02db5 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -261,74 +261,84 @@ def urldefrag(url):
return url, ''
-def unquote_as_string (s, plus=False, charset=None):
- if charset is None:
- charset = "UTF-8"
- return str(unquote_as_bytes(s, plus=plus), charset, 'strict')
+_hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
+_hextochr.update(('%02X' % i, chr(i)) for i in range(256))
-def unquote_as_bytes (s, plus=False):
+def unquote(s):
"""unquote('abc%20def') -> 'abc def'."""
- if plus:
- s = s.replace('+', ' ')
res = s.split('%')
- res[0] = res[0].encode('ASCII', 'strict')
for i in range(1, len(res)):
- res[i] = (bytes.fromhex(res[i][:2]) +
- res[i][2:].encode('ASCII', 'strict'))
- return b''.join(res)
-
-_always_safe = (b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
- b'abcdefghijklmnopqrstuvwxyz'
- b'0123456789'
- b'_.-')
-
-_percent_code = ord('%')
-
-_hextable = b'0123456789ABCDEF'
-
-def quote_as_bytes(s, safe = '/', plus=False):
- """quote(b'abc@def') -> 'abc%40def'"""
-
- if isinstance(s, str):
- s = s.encode("UTF-8", "strict")
- if not (isinstance(s, bytes) or isinstance(s, bytearray)):
- raise ValueError("Argument to quote must be either bytes "
- "or bytearray; string arguments will be "
- "converted to UTF-8 bytes")
-
- safeset = _always_safe + safe.encode('ASCII', 'strict')
- if plus:
- safeset += b' '
-
- result = bytearray()
- for i in s:
- if i not in safeset:
- result.append(_percent_code)
- result.append(_hextable[(i >> 4) & 0xF])
- result.append(_hextable[i & 0xF])
- else:
- result.append(i)
- if plus:
- result = result.replace(b' ', b'+')
- return result
+ item = res[i]
+ try:
+ res[i] = _hextochr[item[:2]] + item[2:]
+ except KeyError:
+ res[i] = '%' + item
+ except UnicodeDecodeError:
+ res[i] = chr(int(item[:2], 16)) + item[2:]
+ return "".join(res)
-def quote_as_string(s, safe = '/', plus=False):
- return str(quote_as_bytes(s, safe=safe, plus=plus), 'ASCII', 'strict')
+def unquote_plus(s):
+ """unquote('%7e/abc+def') -> '~/abc def'"""
+ s = s.replace('+', ' ')
+ return unquote(s)
-# finally, define defaults for 'quote' and 'unquote'
+always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+ 'abcdefghijklmnopqrstuvwxyz'
+ '0123456789' '_.-')
+_safe_quoters= {}
-def quote(s, safe='/'):
- return quote_as_string(s, safe=safe)
+class Quoter:
+ def __init__(self, safe):
+ self.cache = {}
+ self.safe = safe + always_safe
-def quote_plus(s, safe=''):
- return quote_as_string(s, safe=safe, plus=True)
+ def __call__(self, c):
+ try:
+ return self.cache[c]
+ except KeyError:
+ if ord(c) < 256:
+ res = (c in self.safe) and c or ('%%%02X' % ord(c))
+ self.cache[c] = res
+ return res
+ else:
+ return "".join(['%%%02X' % i for i in c.encode("utf-8")])
-def unquote(s):
- return unquote_as_string(s)
+def quote(s, safe = '/'):
+ """quote('abc def') -> 'abc%20def'
-def unquote_plus(s):
- return unquote_as_string(s, plus=True)
+ Each part of a URL, e.g. the path info, the query, etc., has a
+ different set of reserved characters that must be quoted.
+
+ RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
+ the following reserved characters.
+ reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
+ "$" | ","
+
+ Each of these characters is reserved in some component of a URL,
+ but not necessarily in all of them.
+
+ By default, the quote function is intended for quoting the path
+ section of a URL. Thus, it will not encode '/'. This character
+ is reserved, but in typical usage the quote function is being
+ called on a path where the existing slash characters are used as
+ reserved characters.
+ """
+ cachekey = (safe, always_safe)
+ try:
+ quoter = _safe_quoters[cachekey]
+ except KeyError:
+ quoter = Quoter(safe)
+ _safe_quoters[cachekey] = quoter
+ res = map(quoter, s)
+ return ''.join(res)
+
+def quote_plus(s, safe = ''):
+ """Quote the query fragment of a URL; replacing ' ' with '+'"""
+ if ' ' in s:
+ s = quote(s, safe + ' ')
+ return s.replace(' ', '+')
+ return quote(s, safe)
def urlencode(query,doseq=0):
"""Encode a sequence of two-element tuples or dictionary into a URL query string.
@@ -377,7 +387,7 @@ def urlencode(query,doseq=0):
# is there a reasonable way to convert to ASCII?
# encode generates a string, but "replace" or "ignore"
# lose information and "strict" can raise UnicodeError
- v = quote_plus(v)
+ v = quote_plus(v.encode("ASCII","replace"))
l.append(k + '=' + v)
else:
try:
@@ -464,8 +474,7 @@ def splituser(host):
_userprog = re.compile('^(.*)@(.*)$')
match = _userprog.match(host)
- if match:
- return map(unquote, match.group(1, 2))
+ if match: return map(unquote, match.group(1, 2))
return None, host
_passwdprog = None