diff options
Diffstat (limited to 'Lib/urllib/parse.py')
-rw-r--r-- | Lib/urllib/parse.py | 183 |
1 files changed, 136 insertions, 47 deletions
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 3e00695..94d77eb 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -5,9 +5,12 @@ UC Irvine, June 1995. """ import sys +import collections __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", - "urlsplit", "urlunsplit"] + "urlsplit", "urlunsplit", + "quote", "quote_plus", "quote_from_bytes", + "unquote", "unquote_plus", "unquote_to_bytes"] # A classification of schemes ('' means apply by default) uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap', @@ -269,50 +272,101 @@ def urldefrag(url): else: return url, '' - -_hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) -_hextochr.update(('%02X' % i, chr(i)) for i in range(256)) - -def unquote(s): - """unquote('abc%20def') -> 'abc def'.""" - res = s.split('%') +def unquote_to_bytes(string): + """unquote_to_bytes('abc%20def') -> b'abc def'.""" + # Note: strings are encoded as UTF-8. This is only an issue if it contains + # unescaped non-ASCII characters, which URIs should not. + if isinstance(string, str): + string = string.encode('utf-8') + res = string.split(b'%') + res[0] = res[0] + for i in range(1, len(res)): + item = res[i] + try: + res[i] = bytes([int(item[:2], 16)]) + item[2:] + except ValueError: + res[i] = b'%' + item + return b''.join(res) + +def unquote(string, encoding='utf-8', errors='replace'): + """Replace %xx escapes by their single-character equivalent. The optional + encoding and errors parameters specify how to decode percent-encoded + sequences into Unicode characters, as accepted by the bytes.decode() + method. + By default, percent-encoded sequences are decoded with UTF-8, and invalid + sequences are replaced by a placeholder character. + + unquote('abc%20def') -> 'abc def'. + """ + if encoding is None: encoding = 'utf-8' + if errors is None: errors = 'replace' + # pct_sequence: contiguous sequence of percent-encoded bytes, decoded + # (list of single-byte bytes objects) + pct_sequence = [] + res = string.split('%') for i in range(1, len(res)): item = res[i] try: - res[i] = _hextochr[item[:2]] + item[2:] - except KeyError: - res[i] = '%' + item - except UnicodeDecodeError: - res[i] = chr(int(item[:2], 16)) + item[2:] - return "".join(res) - -def unquote_plus(s): - """unquote('%7e/abc+def') -> '~/abc def'""" - s = s.replace('+', ' ') - return unquote(s) - -always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' - 'abcdefghijklmnopqrstuvwxyz' - '0123456789' '_.-') + if not item: raise ValueError + pct_sequence.append(bytes.fromhex(item[:2])) + rest = item[2:] + except ValueError: + rest = '%' + item + if not rest: + # This segment was just a single percent-encoded character. + # May be part of a sequence of code units, so delay decoding. + # (Stored in pct_sequence). + res[i] = '' + else: + # Encountered non-percent-encoded characters. Flush the current + # pct_sequence. + res[i] = b''.join(pct_sequence).decode(encoding, errors) + rest + pct_sequence = [] + if pct_sequence: + # Flush the final pct_sequence + # res[-1] will always be empty if pct_sequence != [] + assert not res[-1], "string=%r, res=%r" % (string, res) + res[-1] = b''.join(pct_sequence).decode(encoding, errors) + return ''.join(res) + +def unquote_plus(string, encoding='utf-8', errors='replace'): + """Like unquote(), but also replace plus signs by spaces, as required for + unquoting HTML form values. + + unquote_plus('%7e/abc+def') -> '~/abc def' + """ + string = string.replace('+', ' ') + return unquote(string, encoding, errors) + +_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + b'abcdefghijklmnopqrstuvwxyz' + b'0123456789' + b'_.-') _safe_quoters= {} -class Quoter: +class Quoter(collections.defaultdict): + """A mapping from bytes (in range(0,256)) to strings. + + String values are percent-encoded byte values, unless the key < 128, and + in the "safe" set (either the specified safe set, or default set). + """ + # Keeps a cache internally, using defaultdict, for efficiency (lookups + # of cached keys don't call Python code at all). def __init__(self, safe): - self.cache = {} - self.safe = safe + always_safe + """safe: bytes object.""" + self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128) - def __call__(self, c): - try: - return self.cache[c] - except KeyError: - if ord(c) < 256: - res = (c in self.safe) and c or ('%%%02X' % ord(c)) - self.cache[c] = res - return res - else: - return "".join(['%%%02X' % i for i in c.encode("utf-8")]) + def __repr__(self): + # Without this, will just display as a defaultdict + return "<Quoter %r>" % dict(self) + + def __missing__(self, b): + # Handle a cache miss. Store quoted string in cache and return. + res = b in self.safe and chr(b) or ('%%%02X' % b) + self[b] = res + return res -def quote(s, safe = '/'): +def quote(string, safe='/', encoding=None, errors=None): """quote('abc def') -> 'abc%20def' Each part of a URL, e.g. the path info, the query, etc., has a @@ -332,22 +386,57 @@ def quote(s, safe = '/'): is reserved, but in typical usage the quote function is being called on a path where the existing slash characters are used as reserved characters. + + string and safe may be either str or bytes objects. encoding must + not be specified if string is a str. + + The optional encoding and errors parameters specify how to deal with + non-ASCII characters, as accepted by the str.encode method. + By default, encoding='utf-8' (characters are encoded with UTF-8), and + errors='strict' (unsupported characters raise a UnicodeEncodeError). + """ + if isinstance(string, str): + if encoding is None: + encoding = 'utf-8' + if errors is None: + errors = 'strict' + string = string.encode(encoding, errors) + else: + if encoding is not None: + raise TypeError("quote() doesn't support 'encoding' for bytes") + if errors is not None: + raise TypeError("quote() doesn't support 'errors' for bytes") + return quote_from_bytes(string, safe) + +def quote_plus(string, safe='', encoding=None, errors=None): + """Like quote(), but also replace ' ' with '+', as required for quoting + HTML form values. Plus signs in the original string are escaped unless + they are included in safe. It also does not have safe default to '/'. """ - cachekey = (safe, always_safe) + # Check if ' ' in string, where string may either be a str or bytes + if ' ' in string if isinstance(string, str) else b' ' in string: + string = quote(string, + safe + ' ' if isinstance(safe, str) else safe + b' ') + return string.replace(' ', '+') + return quote(string, safe, encoding, errors) + +def quote_from_bytes(bs, safe='/'): + """Like quote(), but accepts a bytes object rather than a str, and does + not perform string-to-bytes encoding. It always returns an ASCII string. + quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB' + """ + if isinstance(safe, str): + # Normalize 'safe' by converting to bytes and removing non-ASCII chars + safe = safe.encode('ascii', 'ignore') + cachekey = bytes(safe) # In case it was a bytearray + if not (isinstance(bs, bytes) or isinstance(bs, bytearray)): + raise TypeError("quote_from_bytes() expected a bytes") try: quoter = _safe_quoters[cachekey] except KeyError: quoter = Quoter(safe) _safe_quoters[cachekey] = quoter - res = map(quoter, s) - return ''.join(res) - -def quote_plus(s, safe = ''): - """Quote the query fragment of a URL; replacing ' ' with '+'""" - if ' ' in s: - s = quote(s, safe + ' ') - return s.replace(' ', '+') - return quote(s, safe) + return ''.join(map(quoter.__getitem__, bs)) def urlencode(query,doseq=0): """Encode a sequence of two-element tuples or dictionary into a URL query string. |