From c4a35daa9771fbdcc58cad4656b8680541623e0e Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka
Date: Sun, 30 Oct 2016 23:00:01 +0200
Subject: Issue #28541: Improve test coverage for encoding detection in json library.

Original patch by Eric Appelt.
---
 Lib/json/__init__.py               |  3 ++-
 Lib/test/test_json/test_unicode.py | 13 +++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/Lib/json/__init__.py b/Lib/json/__init__.py
index 8dcc678..94397aa 100644
--- a/Lib/json/__init__.py
+++ b/Lib/json/__init__.py
@@ -257,7 +257,8 @@ def detect_encoding(b):
             return 'utf-16-be' if b[1] else 'utf-32-be'
         if not b[1]:
             # XX 00 00 00 - utf-32-le
-            # XX 00 XX XX - utf-16-le
+            # XX 00 00 XX - utf-16-le
+            # XX 00 XX -- - utf-16-le
             return 'utf-16-le' if b[2] or b[3] else 'utf-32-le'
     elif len(b) == 2:
         if not b[0]:
diff --git a/Lib/test/test_json/test_unicode.py b/Lib/test/test_json/test_unicode.py
index eda177a..2e8bba2 100644
--- a/Lib/test/test_json/test_unicode.py
+++ b/Lib/test/test_json/test_unicode.py
@@ -65,6 +65,19 @@ class TestUnicode:
             self.assertEqual(self.loads(bom + encoded), data)
             self.assertEqual(self.loads(encoded), data)
         self.assertRaises(UnicodeDecodeError, self.loads, b'["\x80"]')
+        # RFC-7159 and ECMA-404 extend JSON to allow documents that
+        # consist of only a string, which can present a special case
+        # not covered by the encoding detection patterns specified in
+        # RFC-4627 for utf-16-le (XX 00 XX 00).
+        self.assertEqual(self.loads('"\u2600"'.encode('utf-16-le')),
+                         '\u2600')
+        # Encoding detection for small (<4) bytes objects
+        # is implemented as a special case. RFC-7159 and ECMA-404
+        # allow single codepoint JSON documents which are only two
+        # bytes in utf-16 encodings w/o BOM.
+        self.assertEqual(self.loads(b'5\x00'), 5)
+        self.assertEqual(self.loads(b'\x007'), 7)
+        self.assertEqual(self.loads(b'57'), 57)
 
     def test_object_pairs_hook_with_unicode(self):
         s = '{"xkd":1, "kcw":2, "art":3, "hxm":4, "qrt":5, "pad":6, "hoy":7}'
-- 
cgit v0.12