From df9f1ecce610c1fc810b73caa8608fb85a22a7db Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 6 Aug 2008 19:31:34 +0000 Subject: Revert accidentally committed files. Oops! --- Lib/test/test_urllib.py | 2 +- Lib/urllib/parse.py | 129 ++++++++++++++++++++++++++---------------------- 2 files changed, 70 insertions(+), 61 deletions(-) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index d4630a8..f5a9d5d 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -465,7 +465,7 @@ class UnquotingTests(unittest.TestCase): def test_unquote_with_unicode(self): r = urllib.parse.unquote('br%C3%BCckner_sapporo_20050930.doc') - self.assertEqual(r, 'br\u00FCckner_sapporo_20050930.doc') + self.assertEqual(r, 'br\xc3\xbcckner_sapporo_20050930.doc') class urlencode_Tests(unittest.TestCase): """Tests for urlencode()""" diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index f924a3a..fe02db5 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -261,74 +261,84 @@ def urldefrag(url): return url, '' -def unquote_as_string (s, plus=False, charset=None): - if charset is None: - charset = "UTF-8" - return str(unquote_as_bytes(s, plus=plus), charset, 'strict') +_hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) +_hextochr.update(('%02X' % i, chr(i)) for i in range(256)) -def unquote_as_bytes (s, plus=False): +def unquote(s): """unquote('abc%20def') -> 'abc def'.""" - if plus: - s = s.replace('+', ' ') res = s.split('%') - res[0] = res[0].encode('ASCII', 'strict') for i in range(1, len(res)): - res[i] = (bytes.fromhex(res[i][:2]) + - res[i][2:].encode('ASCII', 'strict')) - return b''.join(res) - -_always_safe = (b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' - b'abcdefghijklmnopqrstuvwxyz' - b'0123456789' - b'_.-') - -_percent_code = ord('%') - -_hextable = b'0123456789ABCDEF' - -def quote_as_bytes(s, safe = '/', plus=False): - """quote(b'abc@def') -> 'abc%40def'""" - - if isinstance(s, str): - s = s.encode("UTF-8", "strict") - if not (isinstance(s, bytes) or isinstance(s, bytearray)): - raise ValueError("Argument to quote must be either bytes " - "or bytearray; string arguments will be " - "converted to UTF-8 bytes") - - safeset = _always_safe + safe.encode('ASCII', 'strict') - if plus: - safeset += b' ' - - result = bytearray() - for i in s: - if i not in safeset: - result.append(_percent_code) - result.append(_hextable[(i >> 4) & 0xF]) - result.append(_hextable[i & 0xF]) - else: - result.append(i) - if plus: - result = result.replace(b' ', b'+') - return result + item = res[i] + try: + res[i] = _hextochr[item[:2]] + item[2:] + except KeyError: + res[i] = '%' + item + except UnicodeDecodeError: + res[i] = chr(int(item[:2], 16)) + item[2:] + return "".join(res) -def quote_as_string(s, safe = '/', plus=False): - return str(quote_as_bytes(s, safe=safe, plus=plus), 'ASCII', 'strict') +def unquote_plus(s): + """unquote('%7e/abc+def') -> '~/abc def'""" + s = s.replace('+', ' ') + return unquote(s) -# finally, define defaults for 'quote' and 'unquote' +always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' + 'abcdefghijklmnopqrstuvwxyz' + '0123456789' '_.-') +_safe_quoters= {} -def quote(s, safe='/'): - return quote_as_string(s, safe=safe) +class Quoter: + def __init__(self, safe): + self.cache = {} + self.safe = safe + always_safe -def quote_plus(s, safe=''): - return quote_as_string(s, safe=safe, plus=True) + def __call__(self, c): + try: + return self.cache[c] + except KeyError: + if ord(c) < 256: + res = (c in self.safe) and c or ('%%%02X' % ord(c)) + self.cache[c] = res + return res + else: + return "".join(['%%%02X' % i for i in c.encode("utf-8")]) -def unquote(s): - return unquote_as_string(s) +def quote(s, safe = '/'): + """quote('abc def') -> 'abc%20def' -def unquote_plus(s): - return unquote_as_string(s, plus=True) + Each part of a URL, e.g. the path info, the query, etc., has a + different set of reserved characters that must be quoted. + + RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists + the following reserved characters. + reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | + "$" | "," + + Each of these characters is reserved in some component of a URL, + but not necessarily in all of them. + + By default, the quote function is intended for quoting the path + section of a URL. Thus, it will not encode '/'. This character + is reserved, but in typical usage the quote function is being + called on a path where the existing slash characters are used as + reserved characters. + """ + cachekey = (safe, always_safe) + try: + quoter = _safe_quoters[cachekey] + except KeyError: + quoter = Quoter(safe) + _safe_quoters[cachekey] = quoter + res = map(quoter, s) + return ''.join(res) + +def quote_plus(s, safe = ''): + """Quote the query fragment of a URL; replacing ' ' with '+'""" + if ' ' in s: + s = quote(s, safe + ' ') + return s.replace(' ', '+') + return quote(s, safe) def urlencode(query,doseq=0): """Encode a sequence of two-element tuples or dictionary into a URL query string. @@ -377,7 +387,7 @@ def urlencode(query,doseq=0): # is there a reasonable way to convert to ASCII? # encode generates a string, but "replace" or "ignore" # lose information and "strict" can raise UnicodeError - v = quote_plus(v) + v = quote_plus(v.encode("ASCII","replace")) l.append(k + '=' + v) else: try: @@ -464,8 +474,7 @@ def splituser(host): _userprog = re.compile('^(.*)@(.*)$') match = _userprog.match(host) - if match: - return map(unquote, match.group(1, 2)) + if match: return map(unquote, match.group(1, 2)) return None, host _passwdprog = None -- cgit v0.12