diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2024-03-05 15:49:50 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-05 15:49:50 (GMT) |
commit | bdba8ef42b15e651dc23374a08143cc2b4c4657d (patch) | |
tree | 8af049682491f0bb9bd8c49b0f7886b5544fb9c4 | |
parent | f97f25ef5dfcdfec0d9a359fd970abd139cf3428 (diff) | |
download | cpython-bdba8ef42b15e651dc23374a08143cc2b4c4657d.zip cpython-bdba8ef42b15e651dc23374a08143cc2b4c4657d.tar.gz cpython-bdba8ef42b15e651dc23374a08143cc2b4c4657d.tar.bz2 |
gh-74668: Fix support of bytes in urllib.parse.parse_qsl() (GH-115771)
urllib.parse functions parse_qs() and parse_qsl() now support bytes
arguments containing raw and percent-encoded non-ASCII data.
-rw-r--r-- | Lib/test/test_urlparse.py | 37 | ||||
-rw-r--r-- | Lib/urllib/parse.py | 50 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Library/2024-02-21-17-54-59.gh-issue-74668.JT-Q8W.rst | 3 |
3 files changed, 64 insertions, 26 deletions
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 625c6dc..72f0286 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -19,6 +19,10 @@ parse_qsl_test_cases = [ ("=a", [('', 'a')]), ("a", [('a', '')]), ("a=", [('a', '')]), + ("a=b=c", [('a', 'b=c')]), + ("a%3Db=c", [('a=b', 'c')]), + ("a=b&c=d", [('a', 'b'), ('c', 'd')]), + ("a=b%26c=d", [('a', 'b&c=d')]), ("&a=b", [('a', 'b')]), ("a=a+b&b=b+c", [('a', 'a b'), ('b', 'b c')]), ("a=1&a=2", [('a', '1'), ('a', '2')]), @@ -29,6 +33,10 @@ parse_qsl_test_cases = [ (b"=a", [(b'', b'a')]), (b"a", [(b'a', b'')]), (b"a=", [(b'a', b'')]), + (b"a=b=c", [(b'a', b'b=c')]), + (b"a%3Db=c", [(b'a=b', b'c')]), + (b"a=b&c=d", [(b'a', b'b'), (b'c', b'd')]), + (b"a=b%26c=d", [(b'a', b'b&c=d')]), (b"&a=b", [(b'a', b'b')]), (b"a=a+b&b=b+c", [(b'a', b'a b'), (b'b', b'b c')]), (b"a=1&a=2", [(b'a', b'1'), (b'a', b'2')]), @@ -36,6 +44,14 @@ parse_qsl_test_cases = [ ("a=a+b;b=b+c", [('a', 'a b;b=b c')]), (b";a=b", [(b';a', b'b')]), (b"a=a+b;b=b+c", [(b'a', b'a b;b=b c')]), + + ("\u0141=\xE9", [('\u0141', '\xE9')]), + ("%C5%81=%C3%A9", [('\u0141', '\xE9')]), + ("%81=%A9", [('\ufffd', '\ufffd')]), + (b"\xc5\x81=\xc3\xa9", [(b'\xc5\x81', b'\xc3\xa9')]), + (b"%C5%81=%C3%A9", [(b'\xc5\x81', b'\xc3\xa9')]), + (b"\x81=\xA9", [(b'\x81', b'\xa9')]), + (b"%81=%A9", [(b'\x81', b'\xa9')]), ] # Each parse_qs testcase is a two-tuple that contains @@ -49,6 +65,10 @@ parse_qs_test_cases = [ ("=a", {'': ['a']}), ("a", {'a': ['']}), ("a=", {'a': ['']}), + ("a=b=c", {'a': ['b=c']}), + ("a%3Db=c", {'a=b': ['c']}), + ("a=b&c=d", {'a': ['b'], 'c': ['d']}), + ("a=b%26c=d", {'a': ['b&c=d']}), ("&a=b", {'a': ['b']}), ("a=a+b&b=b+c", {'a': ['a b'], 'b': ['b c']}), ("a=1&a=2", {'a': ['1', '2']}), @@ -59,6 +79,10 @@ parse_qs_test_cases = [ (b"=a", {b'': [b'a']}), (b"a", {b'a': [b'']}), (b"a=", {b'a': [b'']}), + (b"a=b=c", {b'a': [b'b=c']}), + (b"a%3Db=c", {b'a=b': [b'c']}), + (b"a=b&c=d", {b'a': [b'b'], b'c': [b'd']}), + (b"a=b%26c=d", {b'a': [b'b&c=d']}), (b"&a=b", {b'a': [b'b']}), (b"a=a+b&b=b+c", {b'a': [b'a b'], b'b': [b'b c']}), (b"a=1&a=2", {b'a': [b'1', b'2']}), @@ -66,6 +90,15 @@ parse_qs_test_cases = [ ("a=a+b;b=b+c", {'a': ['a b;b=b c']}), (b";a=b", {b';a': [b'b']}), (b"a=a+b;b=b+c", {b'a':[ b'a b;b=b c']}), + (b"a=a%E2%80%99b", {b'a': [b'a\xe2\x80\x99b']}), + + ("\u0141=\xE9", {'\u0141': ['\xE9']}), + ("%C5%81=%C3%A9", {'\u0141': ['\xE9']}), + ("%81=%A9", {'\ufffd': ['\ufffd']}), + (b"\xc5\x81=\xc3\xa9", {b'\xc5\x81': [b'\xc3\xa9']}), + (b"%C5%81=%C3%A9", {b'\xc5\x81': [b'\xc3\xa9']}), + (b"\x81=\xA9", {b'\x81': [b'\xa9']}), + (b"%81=%A9", {b'\x81': [b'\xa9']}), ] class UrlParseTestCase(unittest.TestCase): @@ -995,8 +1028,8 @@ class UrlParseTestCase(unittest.TestCase): def test_parse_qsl_max_num_fields(self): with self.assertRaises(ValueError): - urllib.parse.parse_qs('&'.join(['a=a']*11), max_num_fields=10) - urllib.parse.parse_qs('&'.join(['a=a']*10), max_num_fields=10) + urllib.parse.parse_qsl('&'.join(['a=a']*11), max_num_fields=10) + urllib.parse.parse_qsl('&'.join(['a=a']*10), max_num_fields=10) def test_parse_qs_separator(self): parse_qs_semicolon_cases = [ diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index c129b0d..ec52821 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -763,42 +763,44 @@ def parse_qsl(qs, keep_blank_values=False, strict_parsing=False, Returns a list, as G-d intended. """ - qs, _coerce_result = _coerce_args(qs) - separator, _ = _coerce_args(separator) - if not separator or (not isinstance(separator, (str, bytes))): + if not separator or not isinstance(separator, (str, bytes)): raise ValueError("Separator must be of type string or bytes.") + if isinstance(qs, str): + if not isinstance(separator, str): + separator = str(separator, 'ascii') + eq = '=' + def _unquote(s): + return unquote_plus(s, encoding=encoding, errors=errors) + else: + qs = bytes(qs) + if isinstance(separator, str): + separator = bytes(separator, 'ascii') + eq = b'=' + def _unquote(s): + return unquote_to_bytes(s.replace(b'+', b' ')) + + if not qs: + return [] # If max_num_fields is defined then check that the number of fields # is less than max_num_fields. This prevents a memory exhaustion DOS # attack via post bodies with many fields. if max_num_fields is not None: - num_fields = 1 + qs.count(separator) if qs else 0 + num_fields = 1 + qs.count(separator) if max_num_fields < num_fields: raise ValueError('Max number of fields exceeded') r = [] - query_args = qs.split(separator) if qs else [] - for name_value in query_args: - if not name_value and not strict_parsing: - continue - nv = name_value.split('=', 1) - if len(nv) != 2: - if strict_parsing: + for name_value in qs.split(separator): + if name_value or strict_parsing: + name, has_eq, value = name_value.partition(eq) + if not has_eq and strict_parsing: raise ValueError("bad query field: %r" % (name_value,)) - # Handle case of a control-name with no equal sign - if keep_blank_values: - nv.append('') - else: - continue - if len(nv[1]) or keep_blank_values: - name = nv[0].replace('+', ' ') - name = unquote(name, encoding=encoding, errors=errors) - name = _coerce_result(name) - value = nv[1].replace('+', ' ') - value = unquote(value, encoding=encoding, errors=errors) - value = _coerce_result(value) - r.append((name, value)) + if value or keep_blank_values: + name = _unquote(name) + value = _unquote(value) + r.append((name, value)) return r def unquote_plus(string, encoding='utf-8', errors='replace'): diff --git a/Misc/NEWS.d/next/Library/2024-02-21-17-54-59.gh-issue-74668.JT-Q8W.rst b/Misc/NEWS.d/next/Library/2024-02-21-17-54-59.gh-issue-74668.JT-Q8W.rst new file mode 100644 index 0000000..f4a6e6d --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-02-21-17-54-59.gh-issue-74668.JT-Q8W.rst @@ -0,0 +1,3 @@ +:mod:`urllib.parse` functions :func:`~urllib.parse.parse_qs` and +:func:`~urllib.parse.parse_qsl` now support bytes arguments containing raw +and percent-encoded non-ASCII data. |