summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Serhiy Storchaka <storchaka@gmail.com> 2013-11-26 19:27:11 (GMT)
committer Serhiy Storchaka <storchaka@gmail.com> 2013-11-26 19:27:11 (GMT)
commit 687ff0ecdf9eb574c3553eee2a8492668cfa84ef (patch)
tree 19e87329763348558f5e0a92b3e396f078dd6b1a
parent 1df88677e96f258a917b1cec0940ea98aeccaa72 (diff)
parent c93329b3dd6dde3de76f473f5573233cb0366d9c (diff)
download cpython-687ff0ecdf9eb574c3553eee2a8492668cfa84ef.zip
cpython-687ff0ecdf9eb574c3553eee2a8492668cfa84ef.tar.gz
cpython-687ff0ecdf9eb574c3553eee2a8492668cfa84ef.tar.bz2
Issue #11489: JSON decoder now accepts lone surrogates.
-rw-r--r-- Lib/json/decoder.py 35
-rw-r--r-- Lib/test/test_json/test_scanstring.py 51
-rw-r--r-- Misc/NEWS 2
-rw-r--r-- Modules/_json.c 26
4 files changed, 73 insertions, 41 deletions
diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py
index da7ef9c..59e5f41 100644
--- a/Lib/json/decoder.py
+++ b/Lib/json/decoder.py
@@ -58,6 +58,16 @@ BACKSLASH = {
'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t',
}
+def _decode_uXXXX(s, pos):
+ esc = s[pos + 1:pos + 5]
+ if len(esc) == 4 and esc[1] not in 'xX':
+ try:
+ return int(esc, 16)
+ except ValueError:
+ pass
+ msg = "Invalid \\uXXXX escape"
+ raise ValueError(errmsg(msg, s, pos))
+
def py_scanstring(s, end, strict=True,
_b=BACKSLASH, _m=STRINGCHUNK.match):
"""Scan the string s for a JSON string. End is the index of the
@@ -107,25 +117,14 @@ def py_scanstring(s, end, strict=True,
raise ValueError(errmsg(msg, s, end))
end += 1
else:
- esc = s[end + 1:end + 5]
- next_end = end + 5
- if len(esc) != 4:
- msg = "Invalid \\uXXXX escape"
- raise ValueError(errmsg(msg, s, end))
- uni = int(esc, 16)
- if 0xd800 <= uni <= 0xdbff:
- msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
- if not s[end + 5:end + 7] == '\\u':
- raise ValueError(errmsg(msg, s, end))
- esc2 = s[end + 7:end + 11]
- if len(esc2) != 4:
- raise ValueError(errmsg(msg, s, end))
- uni2 = int(esc2, 16)
- uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
- next_end += 6
+ uni = _decode_uXXXX(s, end)
+ end += 5
+ if 0xd800 <= uni <= 0xdbff and s[end:end + 2] == '\\u':
+ uni2 = _decode_uXXXX(s, end + 1)
+ if 0xdc00 <= uni2 <= 0xdfff:
+ uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
+ end += 6
char = chr(uni)
-
- end = next_end
_append(char)
return ''.join(chunks), end
diff --git a/Lib/test/test_json/test_scanstring.py b/Lib/test/test_json/test_scanstring.py
index 2e3a291..07f4358 100644
--- a/Lib/test/test_json/test_scanstring.py
+++ b/Lib/test/test_json/test_scanstring.py
@@ -6,10 +6,6 @@ class TestScanstring:
def test_scanstring(self):
scanstring = self.json.decoder.scanstring
self.assertEqual(
- scanstring('"z\\ud834\\udd20x"', 1, True),
- ('z\U0001d120x', 16))
-
- self.assertEqual(
scanstring('"z\U0001d120x"', 1, True),
('z\U0001d120x', 5))
@@ -89,6 +85,53 @@ class TestScanstring:
scanstring('["Bad value", truth]', 2, True),
('Bad value', 12))
+ def test_surrogates(self):
+ scanstring = self.json.decoder.scanstring
+ def assertScan(given, expect):
+ self.assertEqual(scanstring(given, 1, True),
+ (expect, len(given)))
+
+ assertScan('"z\\ud834\\u0079x"', 'z\ud834yx')
+ assertScan('"z\\ud834\\udd20x"', 'z\U0001d120x')
+ assertScan('"z\\ud834\\ud834\\udd20x"', 'z\ud834\U0001d120x')
+ assertScan('"z\\ud834x"', 'z\ud834x')
+ assertScan('"z\\ud834\udd20x12345"', 'z\ud834\udd20x12345')
+ assertScan('"z\\udd20x"', 'z\udd20x')
+ assertScan('"z\ud834\udd20x"', 'z\ud834\udd20x')
+ assertScan('"z\ud834\\udd20x"', 'z\ud834\udd20x')
+ assertScan('"z\ud834x"', 'z\ud834x')
+
+ def test_bad_escapes(self):
+ scanstring = self.json.decoder.scanstring
+ bad_escapes = [
+ '"\\"',
+ '"\\x"',
+ '"\\u"',
+ '"\\u0"',
+ '"\\u01"',
+ '"\\u012"',
+ '"\\uz012"',
+ '"\\u0z12"',
+ '"\\u01z2"',
+ '"\\u012z"',
+ '"\\u0x12"',
+ '"\\u0X12"',
+ '"\\ud834\\"',
+ '"\\ud834\\u"',
+ '"\\ud834\\ud"',
+ '"\\ud834\\udd"',
+ '"\\ud834\\udd2"',
+ '"\\ud834\\uzdd2"',
+ '"\\ud834\\udzd2"',
+ '"\\ud834\\uddz2"',
+ '"\\ud834\\udd2z"',
+ '"\\ud834\\u0x20"',
+ '"\\ud834\\u0X20"',
+ ]
+ for s in bad_escapes:
+ with self.assertRaises(ValueError, msg=s):
+ scanstring(s, 1, True)
+
def test_overflow(self):
with self.assertRaises(OverflowError):
self.json.decoder.scanstring(b"xxx", sys.maxsize+1)
diff --git a/Misc/NEWS b/Misc/NEWS
index c484855..a3f3309 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -16,6 +16,8 @@ Core and Builtins
Library
-------
+- Issue #11489: JSON decoder now accepts lone surrogates.
+
- Issue #19545: Avoid chained exceptions while passing stray % to
time.strptime(). Initial patch by Claudiu Popa.
diff --git a/Modules/_json.c b/Modules/_json.c
index 301bc87..125101f 100644
--- a/Modules/_json.c
+++ b/Modules/_json.c
@@ -409,17 +409,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
}
}
/* Surrogate pair */
- if (Py_UNICODE_IS_HIGH_SURROGATE(c)) {
+ if (Py_UNICODE_IS_HIGH_SURROGATE(c) && end + 6 < len &&
+ PyUnicode_READ(kind, buf, next++) == '\\' &&
+ PyUnicode_READ(kind, buf, next++) == 'u') {
Py_UCS4 c2 = 0;
- if (end + 6 >= len) {
- raise_errmsg("Unpaired high surrogate", pystr, end - 5);
- goto bail;
- }
- if (PyUnicode_READ(kind, buf, next++) != '\\' ||
- PyUnicode_READ(kind, buf, next++) != 'u') {
- raise_errmsg("Unpaired high surrogate", pystr, end - 5);
- goto bail;
- }
end += 6;
/* Decode 4 hex digits */
for (; next < end; next++) {
@@ -440,15 +433,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
goto bail;
}
}
- if (!Py_UNICODE_IS_LOW_SURROGATE(c2)) {
- raise_errmsg("Unpaired high surrogate", pystr, end - 5);
- goto bail;
- }
- c = Py_UNICODE_JOIN_SURROGATES(c, c2);
- }
- else if (Py_UNICODE_IS_LOW_SURROGATE(c)) {
- raise_errmsg("Unpaired low surrogate", pystr, end - 5);
- goto bail;
+ if (Py_UNICODE_IS_LOW_SURROGATE(c2))
+ c = Py_UNICODE_JOIN_SURROGATES(c, c2);
+ else
+ end -= 6;
}
}
APPEND_OLD_CHUNK