diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2013-11-26 19:27:11 (GMT) |
---|---|---|
committer | Serhiy Storchaka <storchaka@gmail.com> | 2013-11-26 19:27:11 (GMT) |
commit | 687ff0ecdf9eb574c3553eee2a8492668cfa84ef (patch) | |
tree | 19e87329763348558f5e0a92b3e396f078dd6b1a | |
parent | 1df88677e96f258a917b1cec0940ea98aeccaa72 (diff) | |
parent | c93329b3dd6dde3de76f473f5573233cb0366d9c (diff) | |
download | cpython-687ff0ecdf9eb574c3553eee2a8492668cfa84ef.zip cpython-687ff0ecdf9eb574c3553eee2a8492668cfa84ef.tar.gz cpython-687ff0ecdf9eb574c3553eee2a8492668cfa84ef.tar.bz2 |
Issue #11489: JSON decoder now accepts lone surrogates.
-rw-r--r-- | Lib/json/decoder.py | 35 | ||||
-rw-r--r-- | Lib/test/test_json/test_scanstring.py | 51 | ||||
-rw-r--r-- | Misc/NEWS | 2 | ||||
-rw-r--r-- | Modules/_json.c | 26 |
4 files changed, 73 insertions, 41 deletions
diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py index da7ef9c..59e5f41 100644 --- a/Lib/json/decoder.py +++ b/Lib/json/decoder.py @@ -58,6 +58,16 @@ BACKSLASH = { 'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t', } +def _decode_uXXXX(s, pos): + esc = s[pos + 1:pos + 5] + if len(esc) == 4 and esc[1] not in 'xX': + try: + return int(esc, 16) + except ValueError: + pass + msg = "Invalid \\uXXXX escape" + raise ValueError(errmsg(msg, s, pos)) + def py_scanstring(s, end, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match): """Scan the string s for a JSON string. End is the index of the @@ -107,25 +117,14 @@ def py_scanstring(s, end, strict=True, raise ValueError(errmsg(msg, s, end)) end += 1 else: - esc = s[end + 1:end + 5] - next_end = end + 5 - if len(esc) != 4: - msg = "Invalid \\uXXXX escape" - raise ValueError(errmsg(msg, s, end)) - uni = int(esc, 16) - if 0xd800 <= uni <= 0xdbff: - msg = "Invalid \\uXXXX\\uXXXX surrogate pair" - if not s[end + 5:end + 7] == '\\u': - raise ValueError(errmsg(msg, s, end)) - esc2 = s[end + 7:end + 11] - if len(esc2) != 4: - raise ValueError(errmsg(msg, s, end)) - uni2 = int(esc2, 16) - uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00)) - next_end += 6 + uni = _decode_uXXXX(s, end) + end += 5 + if 0xd800 <= uni <= 0xdbff and s[end:end + 2] == '\\u': + uni2 = _decode_uXXXX(s, end + 1) + if 0xdc00 <= uni2 <= 0xdfff: + uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00)) + end += 6 char = chr(uni) - - end = next_end _append(char) return ''.join(chunks), end diff --git a/Lib/test/test_json/test_scanstring.py b/Lib/test/test_json/test_scanstring.py index 2e3a291..07f4358 100644 --- a/Lib/test/test_json/test_scanstring.py +++ b/Lib/test/test_json/test_scanstring.py @@ -6,10 +6,6 @@ class TestScanstring: def test_scanstring(self): scanstring = self.json.decoder.scanstring self.assertEqual( - scanstring('"z\\ud834\\udd20x"', 1, True), - ('z\U0001d120x', 16)) - - self.assertEqual( scanstring('"z\U0001d120x"', 1, True), ('z\U0001d120x', 5)) @@ -89,6 +85,53 @@ class TestScanstring: scanstring('["Bad value", truth]', 2, True), ('Bad value', 12)) + def test_surrogates(self): + scanstring = self.json.decoder.scanstring + def assertScan(given, expect): + self.assertEqual(scanstring(given, 1, True), + (expect, len(given))) + + assertScan('"z\\ud834\\u0079x"', 'z\ud834yx') + assertScan('"z\\ud834\\udd20x"', 'z\U0001d120x') + assertScan('"z\\ud834\\ud834\\udd20x"', 'z\ud834\U0001d120x') + assertScan('"z\\ud834x"', 'z\ud834x') + assertScan('"z\\ud834\udd20x12345"', 'z\ud834\udd20x12345') + assertScan('"z\\udd20x"', 'z\udd20x') + assertScan('"z\ud834\udd20x"', 'z\ud834\udd20x') + assertScan('"z\ud834\\udd20x"', 'z\ud834\udd20x') + assertScan('"z\ud834x"', 'z\ud834x') + + def test_bad_escapes(self): + scanstring = self.json.decoder.scanstring + bad_escapes = [ + '"\\"', + '"\\x"', + '"\\u"', + '"\\u0"', + '"\\u01"', + '"\\u012"', + '"\\uz012"', + '"\\u0z12"', + '"\\u01z2"', + '"\\u012z"', + '"\\u0x12"', + '"\\u0X12"', + '"\\ud834\\"', + '"\\ud834\\u"', + '"\\ud834\\ud"', + '"\\ud834\\udd"', + '"\\ud834\\udd2"', + '"\\ud834\\uzdd2"', + '"\\ud834\\udzd2"', + '"\\ud834\\uddz2"', + '"\\ud834\\udd2z"', + '"\\ud834\\u0x20"', + '"\\ud834\\u0X20"', + ] + for s in bad_escapes: + with self.assertRaises(ValueError, msg=s): + scanstring(s, 1, True) + def test_overflow(self): with self.assertRaises(OverflowError): self.json.decoder.scanstring(b"xxx", sys.maxsize+1) @@ -16,6 +16,8 @@ Core and Builtins Library ------- +- Issue #11489: JSON decoder now accepts lone surrogates. + - Issue #19545: Avoid chained exceptions while passing stray % to time.strptime(). Initial patch by Claudiu Popa. diff --git a/Modules/_json.c b/Modules/_json.c index 301bc87..125101f 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -409,17 +409,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next } } /* Surrogate pair */ - if (Py_UNICODE_IS_HIGH_SURROGATE(c)) { + if (Py_UNICODE_IS_HIGH_SURROGATE(c) && end + 6 < len && + PyUnicode_READ(kind, buf, next++) == '\\' && + PyUnicode_READ(kind, buf, next++) == 'u') { Py_UCS4 c2 = 0; - if (end + 6 >= len) { - raise_errmsg("Unpaired high surrogate", pystr, end - 5); - goto bail; - } - if (PyUnicode_READ(kind, buf, next++) != '\\' || - PyUnicode_READ(kind, buf, next++) != 'u') { - raise_errmsg("Unpaired high surrogate", pystr, end - 5); - goto bail; - } end += 6; /* Decode 4 hex digits */ for (; next < end; next++) { @@ -440,15 +433,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next goto bail; } } - if (!Py_UNICODE_IS_LOW_SURROGATE(c2)) { - raise_errmsg("Unpaired high surrogate", pystr, end - 5); - goto bail; - } - c = Py_UNICODE_JOIN_SURROGATES(c, c2); - } - else if (Py_UNICODE_IS_LOW_SURROGATE(c)) { - raise_errmsg("Unpaired low surrogate", pystr, end - 5); - goto bail; + if (Py_UNICODE_IS_LOW_SURROGATE(c2)) + c = Py_UNICODE_JOIN_SURROGATES(c, c2); + else + end -= 6; } } APPEND_OLD_CHUNK |