summaryrefslogtreecommitdiffstats
path: root/Lib
diff options
context:
space:
mode:
Diffstat (limited to 'Lib')
-rw-r--r--Lib/email/utils.py7
-rw-r--r--Lib/test/test_cgi.py2
-rw-r--r--Lib/test/test_http_cookiejar.py5
-rw-r--r--Lib/test/test_urllib.py249
-rwxr-xr-xLib/test/test_wsgiref.py2
-rw-r--r--Lib/urllib/parse.py183
6 files changed, 376 insertions, 72 deletions
diff --git a/Lib/email/utils.py b/Lib/email/utils.py
index e1d21f6..35275f6 100644
--- a/Lib/email/utils.py
+++ b/Lib/email/utils.py
@@ -219,7 +219,7 @@ def encode_rfc2231(s, charset=None, language=None):
charset is given but not language, the string is encoded using the empty
string for language.
"""
- s = urllib.parse.quote(s, safe='')
+ s = urllib.parse.quote(s, safe='', encoding=charset or 'ascii')
if charset is None and language is None:
return s
if language is None:
@@ -271,7 +271,10 @@ def decode_params(params):
# language specifiers at the beginning of the string.
for num, s, encoded in continuations:
if encoded:
- s = urllib.parse.unquote(s)
+ # Decode as "latin-1", so the characters in s directly
+ # represent the percent-encoded octet values.
+ # collapse_rfc2231_value treats this as an octet sequence.
+ s = urllib.parse.unquote(s, encoding="latin-1")
extended = True
value.append(s)
value = quote(EMPTYSTRING.join(value))
diff --git a/Lib/test/test_cgi.py b/Lib/test/test_cgi.py
index 9381a3c..cc11acc 100644
--- a/Lib/test/test_cgi.py
+++ b/Lib/test/test_cgi.py
@@ -68,6 +68,8 @@ parse_qsl_test_cases = [
("&a=b", [('a', 'b')]),
("a=a+b&b=b+c", [('a', 'a b'), ('b', 'b c')]),
("a=1&a=2", [('a', '1'), ('a', '2')]),
+ ("a=%26&b=%3D", [('a', '&'), ('b', '=')]),
+ ("a=%C3%BC&b=%CA%83", [('a', '\xfc'), ('b', '\u0283')]),
]
parse_strict_test_cases = [
diff --git a/Lib/test/test_http_cookiejar.py b/Lib/test/test_http_cookiejar.py
index 1627923..a97c6fa 100644
--- a/Lib/test/test_http_cookiejar.py
+++ b/Lib/test/test_http_cookiejar.py
@@ -539,6 +539,8 @@ class CookieTests(TestCase):
# unquoted unsafe
("/foo\031/bar", "/foo%19/bar"),
("/\175foo/bar", "/%7Dfoo/bar"),
+ # unicode, latin-1 range
+ ("/foo/bar\u00fc", "/foo/bar%C3%BC"), # UTF-8 encoded
# unicode
("/foo/bar\uabcd", "/foo/bar%EA%AF%8D"), # UTF-8 encoded
]
@@ -1444,7 +1446,8 @@ class LWPCookieTests(TestCase):
# Try some URL encodings of the PATHs.
# (the behaviour here has changed from libwww-perl)
c = CookieJar(DefaultCookiePolicy(rfc2965=True))
- interact_2965(c, "http://www.acme.com/foo%2f%25/%3c%3c%0Anew%E5/%E5",
+ interact_2965(c, "http://www.acme.com/foo%2f%25/"
+ "%3c%3c%0Anew%C3%A5/%C3%A5",
"foo = bar; version = 1")
cookie = interact_2965(
diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py
index f5a9d5d..a8a8111 100644
--- a/Lib/test/test_urllib.py
+++ b/Lib/test/test_urllib.py
@@ -336,10 +336,10 @@ class QuotingTests(unittest.TestCase):
"_.-"])
result = urllib.parse.quote(do_not_quote)
self.assertEqual(do_not_quote, result,
- "using quote(): %s != %s" % (do_not_quote, result))
+ "using quote(): %r != %r" % (do_not_quote, result))
result = urllib.parse.quote_plus(do_not_quote)
self.assertEqual(do_not_quote, result,
- "using quote_plus(): %s != %s" % (do_not_quote, result))
+ "using quote_plus(): %r != %r" % (do_not_quote, result))
def test_default_safe(self):
# Test '/' is default value for 'safe' parameter
@@ -350,11 +350,28 @@ class QuotingTests(unittest.TestCase):
quote_by_default = "<>"
result = urllib.parse.quote(quote_by_default, safe=quote_by_default)
self.assertEqual(quote_by_default, result,
- "using quote(): %s != %s" % (quote_by_default, result))
+ "using quote(): %r != %r" % (quote_by_default, result))
result = urllib.parse.quote_plus(quote_by_default, safe=quote_by_default)
self.assertEqual(quote_by_default, result,
- "using quote_plus(): %s != %s" %
+ "using quote_plus(): %r != %r" %
(quote_by_default, result))
+ # Safe expressed as bytes rather than str
+ result = urllib.parse.quote(quote_by_default, safe=b"<>")
+ self.assertEqual(quote_by_default, result,
+ "using quote(): %r != %r" % (quote_by_default, result))
+ # "Safe" non-ASCII characters should have no effect
+ # (Since URIs are not allowed to have non-ASCII characters)
+ result = urllib.parse.quote("a\xfcb", encoding="latin-1", safe="\xfc")
+ expect = urllib.parse.quote("a\xfcb", encoding="latin-1", safe="")
+ self.assertEqual(expect, result,
+ "using quote(): %r != %r" %
+ (expect, result))
+ # Same as above, but using a bytes rather than str
+ result = urllib.parse.quote("a\xfcb", encoding="latin-1", safe=b"\xfc")
+ expect = urllib.parse.quote("a\xfcb", encoding="latin-1", safe="")
+ self.assertEqual(expect, result,
+ "using quote(): %r != %r" %
+ (expect, result))
def test_default_quoting(self):
# Make sure all characters that should be quoted are by default sans
@@ -378,34 +395,98 @@ class QuotingTests(unittest.TestCase):
expected = "ab%5B%5Dcd"
result = urllib.parse.quote(partial_quote)
self.assertEqual(expected, result,
- "using quote(): %s != %s" % (expected, result))
+ "using quote(): %r != %r" % (expected, result))
self.assertEqual(expected, result,
- "using quote_plus(): %s != %s" % (expected, result))
+ "using quote_plus(): %r != %r" % (expected, result))
def test_quoting_space(self):
# Make sure quote() and quote_plus() handle spaces as specified in
# their unique way
result = urllib.parse.quote(' ')
self.assertEqual(result, hexescape(' '),
- "using quote(): %s != %s" % (result, hexescape(' ')))
+ "using quote(): %r != %r" % (result, hexescape(' ')))
result = urllib.parse.quote_plus(' ')
self.assertEqual(result, '+',
- "using quote_plus(): %s != +" % result)
+ "using quote_plus(): %r != +" % result)
given = "a b cd e f"
expect = given.replace(' ', hexescape(' '))
result = urllib.parse.quote(given)
self.assertEqual(expect, result,
- "using quote(): %s != %s" % (expect, result))
+ "using quote(): %r != %r" % (expect, result))
expect = given.replace(' ', '+')
result = urllib.parse.quote_plus(given)
self.assertEqual(expect, result,
- "using quote_plus(): %s != %s" % (expect, result))
+ "using quote_plus(): %r != %r" % (expect, result))
def test_quoting_plus(self):
self.assertEqual(urllib.parse.quote_plus('alpha+beta gamma'),
'alpha%2Bbeta+gamma')
self.assertEqual(urllib.parse.quote_plus('alpha+beta gamma', '+'),
'alpha+beta+gamma')
+ # Test with bytes
+ self.assertEqual(urllib.parse.quote_plus(b'alpha+beta gamma'),
+ 'alpha%2Bbeta+gamma')
+ # Test with safe bytes
+ self.assertEqual(urllib.parse.quote_plus('alpha+beta gamma', b'+'),
+ 'alpha+beta+gamma')
+
+ def test_quote_bytes(self):
+ # Bytes should quote directly to percent-encoded values
+ given = b"\xa2\xd8ab\xff"
+ expect = "%A2%D8ab%FF"
+ result = urllib.parse.quote(given)
+ self.assertEqual(expect, result,
+ "using quote(): %r != %r" % (expect, result))
+ # Encoding argument should raise type error on bytes input
+ self.assertRaises(TypeError, urllib.parse.quote, given,
+ encoding="latin-1")
+ # quote_from_bytes should work the same
+ result = urllib.parse.quote_from_bytes(given)
+ self.assertEqual(expect, result,
+ "using quote_from_bytes(): %r != %r"
+ % (expect, result))
+
+ def test_quote_with_unicode(self):
+ # Characters in Latin-1 range, encoded by default in UTF-8
+ given = "\xa2\xd8ab\xff"
+ expect = "%C2%A2%C3%98ab%C3%BF"
+ result = urllib.parse.quote(given)
+ self.assertEqual(expect, result,
+ "using quote(): %r != %r" % (expect, result))
+ # Characters in Latin-1 range, encoded with None (default)
+ result = urllib.parse.quote(given, encoding=None, errors=None)
+ self.assertEqual(expect, result,
+ "using quote(): %r != %r" % (expect, result))
+ # Characters in Latin-1 range, encoded with Latin-1
+ given = "\xa2\xd8ab\xff"
+ expect = "%A2%D8ab%FF"
+ result = urllib.parse.quote(given, encoding="latin-1")
+ self.assertEqual(expect, result,
+ "using quote(): %r != %r" % (expect, result))
+ # Characters in BMP, encoded by default in UTF-8
+ given = "\u6f22\u5b57" # "Kanji"
+ expect = "%E6%BC%A2%E5%AD%97"
+ result = urllib.parse.quote(given)
+ self.assertEqual(expect, result,
+ "using quote(): %r != %r" % (expect, result))
+ # Characters in BMP, encoded with Latin-1
+ given = "\u6f22\u5b57"
+ self.assertRaises(UnicodeEncodeError, urllib.parse.quote, given,
+ encoding="latin-1")
+ # Characters in BMP, encoded with Latin-1, with replace error handling
+ given = "\u6f22\u5b57"
+ expect = "%3F%3F" # "??"
+ result = urllib.parse.quote(given, encoding="latin-1",
+ errors="replace")
+ self.assertEqual(expect, result,
+ "using quote(): %r != %r" % (expect, result))
+ # Characters in BMP, Latin-1, with xmlcharref error handling
+ given = "\u6f22\u5b57"
+ expect = "%26%2328450%3B%26%2323383%3B" # "&#28450;&#23383;"
+ result = urllib.parse.quote(given, encoding="latin-1",
+ errors="xmlcharrefreplace")
+ self.assertEqual(expect, result,
+ "using quote(): %r != %r" % (expect, result))
class UnquotingTests(unittest.TestCase):
"""Tests for unquote() and unquote_plus()
@@ -422,23 +503,62 @@ class UnquotingTests(unittest.TestCase):
expect = chr(num)
result = urllib.parse.unquote(given)
self.assertEqual(expect, result,
- "using unquote(): %s != %s" % (expect, result))
+ "using unquote(): %r != %r" % (expect, result))
result = urllib.parse.unquote_plus(given)
self.assertEqual(expect, result,
- "using unquote_plus(): %s != %s" %
+ "using unquote_plus(): %r != %r" %
(expect, result))
escape_list.append(given)
escape_string = ''.join(escape_list)
del escape_list
result = urllib.parse.unquote(escape_string)
self.assertEqual(result.count('%'), 1,
- "using quote(): not all characters escaped; %s" %
- result)
- result = urllib.parse.unquote(escape_string)
- self.assertEqual(result.count('%'), 1,
"using unquote(): not all characters escaped: "
"%s" % result)
+ def test_unquoting_badpercent(self):
+ # Test unquoting on bad percent-escapes
+ given = '%xab'
+ expect = given
+ result = urllib.parse.unquote(given)
+ self.assertEqual(expect, result, "using unquote(): %r != %r"
+ % (expect, result))
+ given = '%x'
+ expect = given
+ result = urllib.parse.unquote(given)
+ self.assertEqual(expect, result, "using unquote(): %r != %r"
+ % (expect, result))
+ given = '%'
+ expect = given
+ result = urllib.parse.unquote(given)
+ self.assertEqual(expect, result, "using unquote(): %r != %r"
+ % (expect, result))
+ # unquote_to_bytes
+ given = '%xab'
+ expect = bytes(given, 'ascii')
+ result = urllib.parse.unquote_to_bytes(given)
+ self.assertEqual(expect, result, "using unquote_to_bytes(): %r != %r"
+ % (expect, result))
+ given = '%x'
+ expect = bytes(given, 'ascii')
+ result = urllib.parse.unquote_to_bytes(given)
+ self.assertEqual(expect, result, "using unquote_to_bytes(): %r != %r"
+ % (expect, result))
+ given = '%'
+ expect = bytes(given, 'ascii')
+ result = urllib.parse.unquote_to_bytes(given)
+ self.assertEqual(expect, result, "using unquote_to_bytes(): %r != %r"
+ % (expect, result))
+
+ def test_unquoting_mixed_case(self):
+ # Test unquoting on mixed-case hex digits in the percent-escapes
+ given = '%Ab%eA'
+ expect = b'\xab\xea'
+ result = urllib.parse.unquote_to_bytes(given)
+ self.assertEqual(expect, result,
+ "using unquote_to_bytes(): %r != %r"
+ % (expect, result))
+
def test_unquoting_parts(self):
# Make sure unquoting works when have non-quoted characters
# interspersed
@@ -446,10 +566,10 @@ class UnquotingTests(unittest.TestCase):
expect = "abcd"
result = urllib.parse.unquote(given)
self.assertEqual(expect, result,
- "using quote(): %s != %s" % (expect, result))
+ "using quote(): %r != %r" % (expect, result))
result = urllib.parse.unquote_plus(given)
self.assertEqual(expect, result,
- "using unquote_plus(): %s != %s" % (expect, result))
+ "using unquote_plus(): %r != %r" % (expect, result))
def test_unquoting_plus(self):
# Test difference between unquote() and unquote_plus()
@@ -457,15 +577,100 @@ class UnquotingTests(unittest.TestCase):
expect = given
result = urllib.parse.unquote(given)
self.assertEqual(expect, result,
- "using unquote(): %s != %s" % (expect, result))
+ "using unquote(): %r != %r" % (expect, result))
expect = given.replace('+', ' ')
result = urllib.parse.unquote_plus(given)
self.assertEqual(expect, result,
- "using unquote_plus(): %s != %s" % (expect, result))
+ "using unquote_plus(): %r != %r" % (expect, result))
+
+ def test_unquote_to_bytes(self):
+ given = 'br%C3%BCckner_sapporo_20050930.doc'
+ expect = b'br\xc3\xbcckner_sapporo_20050930.doc'
+ result = urllib.parse.unquote_to_bytes(given)
+ self.assertEqual(expect, result,
+ "using unquote_to_bytes(): %r != %r"
+ % (expect, result))
+ # Test on a string with unescaped non-ASCII characters
+ # (Technically an invalid URI; expect those characters to be UTF-8
+ # encoded).
+ result = urllib.parse.unquote_to_bytes("\u6f22%C3%BC")
+ expect = b'\xe6\xbc\xa2\xc3\xbc' # UTF-8 for "\u6f22\u00fc"
+ self.assertEqual(expect, result,
+ "using unquote_to_bytes(): %r != %r"
+ % (expect, result))
+ # Test with a bytes as input
+ given = b'%A2%D8ab%FF'
+ expect = b'\xa2\xd8ab\xff'
+ result = urllib.parse.unquote_to_bytes(given)
+ self.assertEqual(expect, result,
+ "using unquote_to_bytes(): %r != %r"
+ % (expect, result))
+ # Test with a bytes as input, with unescaped non-ASCII bytes
+ # (Technically an invalid URI; expect those bytes to be preserved)
+ given = b'%A2\xd8ab%FF'
+ expect = b'\xa2\xd8ab\xff'
+ result = urllib.parse.unquote_to_bytes(given)
+ self.assertEqual(expect, result,
+ "using unquote_to_bytes(): %r != %r"
+ % (expect, result))
def test_unquote_with_unicode(self):
- r = urllib.parse.unquote('br%C3%BCckner_sapporo_20050930.doc')
- self.assertEqual(r, 'br\xc3\xbcckner_sapporo_20050930.doc')
+ # Characters in the Latin-1 range, encoded with UTF-8
+ given = 'br%C3%BCckner_sapporo_20050930.doc'
+ expect = 'br\u00fcckner_sapporo_20050930.doc'
+ result = urllib.parse.unquote(given)
+ self.assertEqual(expect, result,
+ "using unquote(): %r != %r" % (expect, result))
+ # Characters in the Latin-1 range, encoded with None (default)
+ result = urllib.parse.unquote(given, encoding=None, errors=None)
+ self.assertEqual(expect, result,
+ "using unquote(): %r != %r" % (expect, result))
+
+ # Characters in the Latin-1 range, encoded with Latin-1
+ result = urllib.parse.unquote('br%FCckner_sapporo_20050930.doc',
+ encoding="latin-1")
+ expect = 'br\u00fcckner_sapporo_20050930.doc'
+ self.assertEqual(expect, result,
+ "using unquote(): %r != %r" % (expect, result))
+
+ # Characters in BMP, encoded with UTF-8
+ given = "%E6%BC%A2%E5%AD%97"
+ expect = "\u6f22\u5b57" # "Kanji"
+ result = urllib.parse.unquote(given)
+ self.assertEqual(expect, result,
+ "using unquote(): %r != %r" % (expect, result))
+
+ # Decode with UTF-8, invalid sequence
+ given = "%F3%B1"
+ expect = "\ufffd" # Replacement character
+ result = urllib.parse.unquote(given)
+ self.assertEqual(expect, result,
+ "using unquote(): %r != %r" % (expect, result))
+
+ # Decode with UTF-8, invalid sequence, replace errors
+ result = urllib.parse.unquote(given, errors="replace")
+ self.assertEqual(expect, result,
+ "using unquote(): %r != %r" % (expect, result))
+
+ # Decode with UTF-8, invalid sequence, ignoring errors
+ given = "%F3%B1"
+ expect = ""
+ result = urllib.parse.unquote(given, errors="ignore")
+ self.assertEqual(expect, result,
+ "using unquote(): %r != %r" % (expect, result))
+
+ # A mix of non-ASCII and percent-encoded characters, UTF-8
+ result = urllib.parse.unquote("\u6f22%C3%BC")
+ expect = '\u6f22\u00fc'
+ self.assertEqual(expect, result,
+ "using unquote(): %r != %r" % (expect, result))
+
+ # A mix of non-ASCII and percent-encoded characters, Latin-1
+ # (Note, the string contains non-Latin-1-representable characters)
+ result = urllib.parse.unquote("\u6f22%FC", encoding="latin-1")
+ expect = '\u6f22\u00fc'
+ self.assertEqual(expect, result,
+ "using unquote(): %r != %r" % (expect, result))
class urlencode_Tests(unittest.TestCase):
"""Tests for urlencode()"""
diff --git a/Lib/test/test_wsgiref.py b/Lib/test/test_wsgiref.py
index ac5ada3..b98452d 100755
--- a/Lib/test/test_wsgiref.py
+++ b/Lib/test/test_wsgiref.py
@@ -291,6 +291,7 @@ class UtilityTests(TestCase):
def testAppURIs(self):
self.checkAppURI("http://127.0.0.1/")
self.checkAppURI("http://127.0.0.1/spam", SCRIPT_NAME="/spam")
+ self.checkAppURI("http://127.0.0.1/sp%C3%A4m", SCRIPT_NAME="/späm")
self.checkAppURI("http://spam.example.com:2071/",
HTTP_HOST="spam.example.com:2071", SERVER_PORT="2071")
self.checkAppURI("http://spam.example.com/",
@@ -304,6 +305,7 @@ class UtilityTests(TestCase):
def testReqURIs(self):
self.checkReqURI("http://127.0.0.1/")
self.checkReqURI("http://127.0.0.1/spam", SCRIPT_NAME="/spam")
+ self.checkReqURI("http://127.0.0.1/sp%C3%A4m", SCRIPT_NAME="/späm")
self.checkReqURI("http://127.0.0.1/spammity/spam",
SCRIPT_NAME="/spammity", PATH_INFO="/spam")
self.checkReqURI("http://127.0.0.1/spammity/spam?say=ni",
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 3e00695..94d77eb 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -5,9 +5,12 @@ UC Irvine, June 1995.
"""
import sys
+import collections
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
- "urlsplit", "urlunsplit"]
+ "urlsplit", "urlunsplit",
+ "quote", "quote_plus", "quote_from_bytes",
+ "unquote", "unquote_plus", "unquote_to_bytes"]
# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
@@ -269,50 +272,101 @@ def urldefrag(url):
else:
return url, ''
-
-_hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
-_hextochr.update(('%02X' % i, chr(i)) for i in range(256))
-
-def unquote(s):
- """unquote('abc%20def') -> 'abc def'."""
- res = s.split('%')
+def unquote_to_bytes(string):
+ """unquote_to_bytes('abc%20def') -> b'abc def'."""
+ # Note: strings are encoded as UTF-8. This is only an issue if it contains
+ # unescaped non-ASCII characters, which URIs should not.
+ if isinstance(string, str):
+ string = string.encode('utf-8')
+ res = string.split(b'%')
+ res[0] = res[0]
+ for i in range(1, len(res)):
+ item = res[i]
+ try:
+ res[i] = bytes([int(item[:2], 16)]) + item[2:]
+ except ValueError:
+ res[i] = b'%' + item
+ return b''.join(res)
+
+def unquote(string, encoding='utf-8', errors='replace'):
+ """Replace %xx escapes by their single-character equivalent. The optional
+ encoding and errors parameters specify how to decode percent-encoded
+ sequences into Unicode characters, as accepted by the bytes.decode()
+ method.
+ By default, percent-encoded sequences are decoded with UTF-8, and invalid
+ sequences are replaced by a placeholder character.
+
+ unquote('abc%20def') -> 'abc def'.
+ """
+ if encoding is None: encoding = 'utf-8'
+ if errors is None: errors = 'replace'
+ # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
+ # (list of single-byte bytes objects)
+ pct_sequence = []
+ res = string.split('%')
for i in range(1, len(res)):
item = res[i]
try:
- res[i] = _hextochr[item[:2]] + item[2:]
- except KeyError:
- res[i] = '%' + item
- except UnicodeDecodeError:
- res[i] = chr(int(item[:2], 16)) + item[2:]
- return "".join(res)
-
-def unquote_plus(s):
- """unquote('%7e/abc+def') -> '~/abc def'"""
- s = s.replace('+', ' ')
- return unquote(s)
-
-always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
- 'abcdefghijklmnopqrstuvwxyz'
- '0123456789' '_.-')
+ if not item: raise ValueError
+ pct_sequence.append(bytes.fromhex(item[:2]))
+ rest = item[2:]
+ except ValueError:
+ rest = '%' + item
+ if not rest:
+ # This segment was just a single percent-encoded character.
+ # May be part of a sequence of code units, so delay decoding.
+ # (Stored in pct_sequence).
+ res[i] = ''
+ else:
+ # Encountered non-percent-encoded characters. Flush the current
+ # pct_sequence.
+ res[i] = b''.join(pct_sequence).decode(encoding, errors) + rest
+ pct_sequence = []
+ if pct_sequence:
+ # Flush the final pct_sequence
+ # res[-1] will always be empty if pct_sequence != []
+ assert not res[-1], "string=%r, res=%r" % (string, res)
+ res[-1] = b''.join(pct_sequence).decode(encoding, errors)
+ return ''.join(res)
+
+def unquote_plus(string, encoding='utf-8', errors='replace'):
+ """Like unquote(), but also replace plus signs by spaces, as required for
+ unquoting HTML form values.
+
+ unquote_plus('%7e/abc+def') -> '~/abc def'
+ """
+ string = string.replace('+', ' ')
+ return unquote(string, encoding, errors)
+
+_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+ b'abcdefghijklmnopqrstuvwxyz'
+ b'0123456789'
+ b'_.-')
_safe_quoters= {}
-class Quoter:
+class Quoter(collections.defaultdict):
+ """A mapping from bytes (in range(0,256)) to strings.
+
+ String values are percent-encoded byte values, unless the key < 128, and
+ in the "safe" set (either the specified safe set, or default set).
+ """
+ # Keeps a cache internally, using defaultdict, for efficiency (lookups
+ # of cached keys don't call Python code at all).
def __init__(self, safe):
- self.cache = {}
- self.safe = safe + always_safe
+ """safe: bytes object."""
+ self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128)
- def __call__(self, c):
- try:
- return self.cache[c]
- except KeyError:
- if ord(c) < 256:
- res = (c in self.safe) and c or ('%%%02X' % ord(c))
- self.cache[c] = res
- return res
- else:
- return "".join(['%%%02X' % i for i in c.encode("utf-8")])
+ def __repr__(self):
+ # Without this, will just display as a defaultdict
+ return "<Quoter %r>" % dict(self)
+
+ def __missing__(self, b):
+ # Handle a cache miss. Store quoted string in cache and return.
+ res = b in self.safe and chr(b) or ('%%%02X' % b)
+ self[b] = res
+ return res
-def quote(s, safe = '/'):
+def quote(string, safe='/', encoding=None, errors=None):
"""quote('abc def') -> 'abc%20def'
Each part of a URL, e.g. the path info, the query, etc., has a
@@ -332,22 +386,57 @@ def quote(s, safe = '/'):
is reserved, but in typical usage the quote function is being
called on a path where the existing slash characters are used as
reserved characters.
+
+ string and safe may be either str or bytes objects. encoding must
+ not be specified if string is a str.
+
+ The optional encoding and errors parameters specify how to deal with
+ non-ASCII characters, as accepted by the str.encode method.
+ By default, encoding='utf-8' (characters are encoded with UTF-8), and
+ errors='strict' (unsupported characters raise a UnicodeEncodeError).
+ """
+ if isinstance(string, str):
+ if encoding is None:
+ encoding = 'utf-8'
+ if errors is None:
+ errors = 'strict'
+ string = string.encode(encoding, errors)
+ else:
+ if encoding is not None:
+ raise TypeError("quote() doesn't support 'encoding' for bytes")
+ if errors is not None:
+ raise TypeError("quote() doesn't support 'errors' for bytes")
+ return quote_from_bytes(string, safe)
+
+def quote_plus(string, safe='', encoding=None, errors=None):
+ """Like quote(), but also replace ' ' with '+', as required for quoting
+ HTML form values. Plus signs in the original string are escaped unless
+ they are included in safe. It also does not default safe to '/'.
"""
- cachekey = (safe, always_safe)
+ # Check if ' ' in string, where string may either be a str or bytes
+ if ' ' in string if isinstance(string, str) else b' ' in string:
+ string = quote(string,
+ safe + ' ' if isinstance(safe, str) else safe + b' ')
+ return string.replace(' ', '+')
+ return quote(string, safe, encoding, errors)
+
+def quote_from_bytes(bs, safe='/'):
+ """Like quote(), but accepts a bytes object rather than a str, and does
+ not perform string-to-bytes encoding. It always returns an ASCII string.
+ quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB'
+ """
+ if isinstance(safe, str):
+ # Normalize 'safe' by converting to bytes and removing non-ASCII chars
+ safe = safe.encode('ascii', 'ignore')
+ cachekey = bytes(safe) # In case it was a bytearray
+ if not (isinstance(bs, bytes) or isinstance(bs, bytearray)):
+ raise TypeError("quote_from_bytes() expected a bytes")
try:
quoter = _safe_quoters[cachekey]
except KeyError:
quoter = Quoter(safe)
_safe_quoters[cachekey] = quoter
- res = map(quoter, s)
- return ''.join(res)
-
-def quote_plus(s, safe = ''):
- """Quote the query fragment of a URL; replacing ' ' with '+'"""
- if ' ' in s:
- s = quote(s, safe + ' ')
- return s.replace(' ', '+')
- return quote(s, safe)
+ return ''.join(map(quoter.__getitem__, bs))
def urlencode(query,doseq=0):
"""Encode a sequence of two-element tuples or dictionary into a URL query string.