Issue #12016: Multibyte CJK decoders now resynchronize faster

They only ignore the first byte of an invalid byte sequence. For example, b'\xff\n'.decode('gb2312', 'replace') gives '\ufffd\n' instead of '\ufffd'.
author: Victor Stinner <victor.stinner@haypocalc.com> 2011-07-07 23:45:13 (GMT)
committer: Victor Stinner <victor.stinner@haypocalc.com> 2011-07-07 23:45:13 (GMT)
commit: 2cded9c3f31d2fea4b033f44eaa828e508f03391 (patch)
tree: 1554d9f0baa575b7ae791ff1267c4e493a1b36bf
parent: 081fe46ff96bccb1a256c356443b625b467814c8 (diff)
download: cpython-2cded9c3f31d2fea4b033f44eaa828e508f03391.zip
cpython-2cded9c3f31d2fea4b033f44eaa828e508f03391.tar.gz
cpython-2cded9c3f31d2fea4b033f44eaa828e508f03391.tar.bz2
13 files changed, 159 insertions, 93 deletions
diff --git a/Doc/whatsnew/3.3.rst b/Doc/whatsnew/3.3.rst
index e5e1805..990085e 100644
--- a/Doc/whatsnew/3.3.rst
+++ b/Doc/whatsnew/3.3.rst
@@ -68,6 +68,29 @@ New, Improved, and Deprecated Modules
 
 * Stub
 
+codecs
+------
+
+Multibyte CJK decoders now resynchronize faster. They only ignore the first
+byte of an invalid byte sequence. For example, b'\xff\n'.decode('gb2312',
+'replace') gives '�\n' instead of '�'.
+
+(http://bugs.python.org/issue12016)
+
+Don't reset incremental encoders of CJK codecs at each call to their encode()
+method anymore. For example: ::
+
+    $ ./python -q
+    >>> import codecs
+    >>> encoder = codecs.getincrementalencoder('hz')('strict')
+    >>> b''.join(encoder.encode(x) for x in '\u52ff\u65bd\u65bc\u4eba\u3002 Bye.')
+    b'~{NpJ)l6HK!#~} Bye.'
+
+This example gives b'~{Np~}~{J)~}~{l6~}~{HK~}~{!#~} Bye.' with older Python
+versions.
+
+(http://bugs.python.org/issue12100)
+
 faulthandler
 ------------
 
diff --git a/Lib/test/test_codecencodings_cn.py b/Lib/test/test_codecencodings_cn.py
index dca9f10..ee3d165 100644
--- a/Lib/test/test_codecencodings_cn.py
+++ b/Lib/test/test_codecencodings_cn.py
@@ -15,8 +15,8 @@ class Test_GB2312(test_multibytecodec_support.TestBase, unittest.TestCase):
         # invalid bytes
         (b"abc\x81\x81\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x81\x81\xc1\xc4", "replace", "abc\ufffd\u804a"),
-        (b"abc\x81\x81\xc1\xc4\xc8", "replace", "abc\ufffd\u804a\ufffd"),
+        (b"abc\x81\x81\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),
+        (b"abc\x81\x81\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),
         (b"abc\x81\x81\xc1\xc4", "ignore",  "abc\u804a"),
         (b"\xc1\x64", "strict", None),
     )
@@ -28,8 +28,8 @@ class Test_GBK(test_multibytecodec_support.TestBase, unittest.TestCase):
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u804a"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u804a\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u804a"),
         (b"\x83\x34\x83\x31", "strict", None),
         ("\u30fb", "strict", None),
@@ -42,11 +42,14 @@ class Test_GB18030(test_multibytecodec_support.TestBase, unittest.TestCase):
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u804a"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u804a\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u804a"),
-        (b"abc\x84\x39\x84\x39\xc1\xc4", "replace", "abc\ufffd\u804a"),
+        (b"abc\x84\x39\x84\x39\xc1\xc4", "replace", "abc\ufffd9\ufffd9\u804a"),
         ("\u30fb", "strict", b"\x819\xa79"),
+        (b"abc\x84\x32\x80\x80def", "replace", 'abc\ufffd2\ufffd\ufffddef'),
+        (b"abc\x81\x30\x81\x30def", "strict", 'abc\x80def'),
+        (b"abc\x86\x30\x81\x30def", "replace", 'abc\ufffd0\ufffd0def'),
     )
     has_iso10646 = True
 
@@ -74,9 +77,11 @@ class Test_HZ(test_multibytecodec_support.TestBase, unittest.TestCase):
          '\u5df1\u6240\u4e0d\u6b32\uff0c\u52ff\u65bd\u65bc\u4eba\u3002'
          'Bye.\n'),
         # invalid bytes
-        (b'ab~cd', 'replace', 'ab\uFFFDd'),
+        (b'ab~cd', 'replace', 'ab\uFFFDcd'),
         (b'ab\xffcd', 'replace', 'ab\uFFFDcd'),
         (b'ab~{\x81\x81\x41\x44~}cd', 'replace', 'ab\uFFFD\uFFFD\u804Acd'),
+        (b'ab~{\x41\x44~}cd', 'replace', 'ab\u804Acd'),
+        (b"ab~{\x79\x79\x41\x44~}cd", "replace", "ab\ufffd\ufffd\u804acd"),
     )
 
 def test_main():
diff --git a/Lib/test/test_codecencodings_hk.py b/Lib/test/test_codecencodings_hk.py
index ccdc0b4..520df43 100644
--- a/Lib/test/test_codecencodings_hk.py
+++ b/Lib/test/test_codecencodings_hk.py
@@ -15,8 +15,8 @@ class Test_Big5HKSCS(test_multibytecodec_support.TestBase, unittest.TestCase):
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u8b10"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u8b10\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u8b10"),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u8b10\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u8b10"),
     )
 
diff --git a/Lib/test/test_codecencodings_jp.py b/Lib/test/test_codecencodings_jp.py
index f56a373..87e4812 100644
--- a/Lib/test/test_codecencodings_jp.py
+++ b/Lib/test/test_codecencodings_jp.py
@@ -15,50 +15,57 @@ class Test_CP932(test_multibytecodec_support.TestBase, unittest.TestCase):
         # invalid bytes
         (b"abc\x81\x00\x81\x00\x82\x84", "strict",  None),
         (b"abc\xf8", "strict",  None),
-        (b"abc\x81\x00\x82\x84", "replace", "abc\ufffd\uff44"),
-        (b"abc\x81\x00\x82\x84\x88", "replace", "abc\ufffd\uff44\ufffd"),
-        (b"abc\x81\x00\x82\x84", "ignore",  "abc\uff44"),
+        (b"abc\x81\x00\x82\x84", "replace", "abc\ufffd\x00\uff44"),
+        (b"abc\x81\x00\x82\x84\x88", "replace", "abc\ufffd\x00\uff44\ufffd"),
+        (b"abc\x81\x00\x82\x84", "ignore",  "abc\x00\uff44"),
+        (b"ab\xEBxy", "replace", "ab\uFFFDxy"),
+        (b"ab\xF0\x39xy", "replace", "ab\uFFFD9xy"),
+        (b"ab\xEA\xF0xy", "replace", 'ab\ufffd\ue038y'),
         # sjis vs cp932
         (b"\\\x7e", "replace", "\\\x7e"),
         (b"\x81\x5f\x81\x61\x81\x7c", "replace", "\uff3c\u2225\uff0d"),
     )
 
+euc_commontests = (
+    # invalid bytes
+    (b"abc\x80\x80\xc1\xc4", "strict",  None),
+    (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u7956"),
+    (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u7956\ufffd"),
+    (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u7956"),
+    (b"abc\xc8", "strict",  None),
+    (b"abc\x8f\x83\x83", "replace", "abc\ufffd\ufffd\ufffd"),
+    (b"\x82\xFCxy", "replace", "\ufffd\ufffdxy"),
+    (b"\xc1\x64", "strict", None),
+    (b"\xa1\xc0", "strict", "\uff3c"),
+    (b"\xa1\xc0\\", "strict", "\uff3c\\"),
+    (b"\x8eXY", "replace", "\ufffdXY"),
+)
+
+class Test_EUC_JIS_2004(test_multibytecodec_support.TestBase,
+                        unittest.TestCase):
+    encoding = 'euc_jis_2004'
+    tstring = test_multibytecodec_support.load_teststring('euc_jisx0213')
+    codectests = euc_commontests
+    xmlcharnametest = (
+        "\xab\u211c\xbb = \u2329\u1234\u232a",
+        b"\xa9\xa8&real;\xa9\xb2 = &lang;&#4660;&rang;"
+    )
+
 class Test_EUC_JISX0213(test_multibytecodec_support.TestBase,
                         unittest.TestCase):
     encoding = 'euc_jisx0213'
     tstring = test_multibytecodec_support.load_teststring('euc_jisx0213')
-    codectests = (
-        # invalid bytes
-        (b"abc\x80\x80\xc1\xc4", "strict",  None),
-        (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u7956"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u7956\ufffd"),
-        (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u7956"),
-        (b"abc\x8f\x83\x83", "replace", "abc\ufffd"),
-        (b"\xc1\x64", "strict", None),
-        (b"\xa1\xc0", "strict", "\uff3c"),
-    )
+    codectests = euc_commontests
     xmlcharnametest = (
         "\xab\u211c\xbb = \u2329\u1234\u232a",
         b"\xa9\xa8&real;\xa9\xb2 = &lang;&#4660;&rang;"
     )
 
-eucjp_commontests = (
-    (b"abc\x80\x80\xc1\xc4", "strict",  None),
-    (b"abc\xc8", "strict",  None),
-    (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u7956"),
-    (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u7956\ufffd"),
-    (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u7956"),
-    (b"abc\x8f\x83\x83", "replace", "abc\ufffd"),
-    (b"\xc1\x64", "strict", None),
-)
-
 class Test_EUC_JP_COMPAT(test_multibytecodec_support.TestBase,
                          unittest.TestCase):
     encoding = 'euc_jp'
     tstring = test_multibytecodec_support.load_teststring('euc_jp')
-    codectests = eucjp_commontests + (
-        (b"\xa1\xc0\\", "strict", "\uff3c\\"),
+    codectests = euc_commontests + (
         ("\xa5", "strict", b"\x5c"),
         ("\u203e", "strict", b"\x7e"),
     )
@@ -66,8 +73,6 @@ class Test_EUC_JP_COMPAT(test_multibytecodec_support.TestBase,
 shiftjis_commonenctests = (
     (b"abc\x80\x80\x82\x84", "strict",  None),
     (b"abc\xf8", "strict",  None),
-    (b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\uff44"),
-    (b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\uff44\ufffd"),
     (b"abc\x80\x80\x82\x84def", "ignore",  "abc\uff44def"),
 )
 
@@ -75,20 +80,41 @@ class Test_SJIS_COMPAT(test_multibytecodec_support.TestBase, unittest.TestCase):
     encoding = 'shift_jis'
     tstring = test_multibytecodec_support.load_teststring('shift_jis')
     codectests = shiftjis_commonenctests + (
+        (b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\ufffd\uff44"),
+        (b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\ufffd\uff44\ufffd"),
+
         (b"\\\x7e", "strict", "\\\x7e"),
         (b"\x81\x5f\x81\x61\x81\x7c", "strict", "\uff3c\u2016\u2212"),
+        (b"abc\x81\x39", "replace",  "abc\ufffd9"),
+        (b"abc\xEA\xFC", "replace",  "abc\ufffd\ufffd"),
+        (b"abc\xFF\x58", "replace",  "abc\ufffdX"),
+    )
+
+class Test_SJIS_2004(test_multibytecodec_support.TestBase, unittest.TestCase):
+    encoding = 'shift_jis_2004'
+    tstring = test_multibytecodec_support.load_teststring('shift_jis')
+    codectests = shiftjis_commonenctests + (
+        (b"\\\x7e", "strict", "\xa5\u203e"),
+        (b"\x81\x5f\x81\x61\x81\x7c", "strict", "\\\u2016\u2212"),
+        (b"abc\xEA\xFC", "strict",  "abc\u64bf"),
+        (b"\x81\x39xy", "replace",  "\ufffd9xy"),
+        (b"\xFF\x58xy", "replace",  "\ufffdXxy"),
+        (b"\x80\x80\x82\x84xy", "replace", "\ufffd\ufffd\uff44xy"),
+        (b"\x80\x80\x82\x84\x88xy", "replace", "\ufffd\ufffd\uff44\u5864y"),
+        (b"\xFC\xFBxy", "replace", '\ufffd\u95b4y'),
+    )
+    xmlcharnametest = (
+        "\xab\u211c\xbb = \u2329\u1234\u232a",
+        b"\x85G&real;\x85Q = &lang;&#4660;&rang;"
     )
 
 class Test_SJISX0213(test_multibytecodec_support.TestBase, unittest.TestCase):
     encoding = 'shift_jisx0213'
     tstring = test_multibytecodec_support.load_teststring('shift_jisx0213')
-    codectests = (
-        # invalid bytes
-        (b"abc\x80\x80\x82\x84", "strict",  None),
-        (b"abc\xf8", "strict",  None),
-        (b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\uff44"),
-        (b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\uff44\ufffd"),
-        (b"abc\x80\x80\x82\x84def", "ignore",  "abc\uff44def"),
+    codectests = shiftjis_commonenctests + (
+        (b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\ufffd\uff44"),
+        (b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\ufffd\uff44\ufffd"),
+
         # sjis vs cp932
         (b"\\\x7e", "replace", "\xa5\u203e"),
         (b"\x81\x5f\x81\x61\x81\x7c", "replace", "\x5c\u2016\u2212"),
diff --git a/Lib/test/test_codecencodings_kr.py b/Lib/test/test_codecencodings_kr.py
index de4da7f..4997e83 100644
--- a/Lib/test/test_codecencodings_kr.py
+++ b/Lib/test/test_codecencodings_kr.py
@@ -15,8 +15,8 @@ class Test_CP949(test_multibytecodec_support.TestBase, unittest.TestCase):
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\uc894"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\uc894\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\uc894"),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\uc894\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\uc894"),
     )
 
@@ -27,8 +27,8 @@ class Test_EUCKR(test_multibytecodec_support.TestBase, unittest.TestCase):
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\uc894"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\uc894\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", 'abc\ufffd\ufffd\uc894'),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\uc894\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\uc894"),
 
         # composed make-up sequence errors
@@ -40,13 +40,14 @@ class Test_EUCKR(test_multibytecodec_support.TestBase, unittest.TestCase):
         (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4", "strict", None),
         (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4", "strict", "\uc4d4"),
         (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4x", "strict", "\uc4d4x"),
-        (b"a\xa4\xd4\xa4\xb6\xa4", "replace", "a\ufffd"),
+        (b"a\xa4\xd4\xa4\xb6\xa4", "replace", 'a\ufffd'),
         (b"\xa4\xd4\xa3\xb6\xa4\xd0\xa4\xd4", "strict", None),
         (b"\xa4\xd4\xa4\xb6\xa3\xd0\xa4\xd4", "strict", None),
         (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa3\xd4", "strict", None),
-        (b"\xa4\xd4\xa4\xff\xa4\xd0\xa4\xd4", "replace", "\ufffd"),
-        (b"\xa4\xd4\xa4\xb6\xa4\xff\xa4\xd4", "replace", "\ufffd"),
-        (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xff", "replace", "\ufffd"),
+        (b"\xa4\xd4\xa4\xff\xa4\xd0\xa4\xd4", "replace", '\ufffd\u6e21\ufffd\u3160\ufffd'),
+        (b"\xa4\xd4\xa4\xb6\xa4\xff\xa4\xd4", "replace", '\ufffd\u6e21\ub544\ufffd\ufffd'),
+        (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xff", "replace", '\ufffd\u6e21\ub544\u572d\ufffd'),
+        (b"\xa4\xd4\xff\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4", "replace", '\ufffd\ufffd\ufffd\uc4d4'),
         (b"\xc1\xc4", "strict", "\uc894"),
     )
 
@@ -57,9 +58,13 @@ class Test_JOHAB(test_multibytecodec_support.TestBase, unittest.TestCase):
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ucd27"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ucd27\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\ucd27"),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\ucd27\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\ucd27"),
+        (b"\xD8abc", "replace",  "\uFFFDabc"),
+        (b"\xD8\xFFabc", "replace",  "\uFFFD\uFFFDabc"),
+        (b"\x84bxy", "replace",  "\uFFFDbxy"),
+        (b"\x8CBxy", "replace",  "\uFFFDBxy"),
     )
 
 def test_main():
diff --git a/Lib/test/test_codecencodings_tw.py b/Lib/test/test_codecencodings_tw.py
index 12d3c9f..f2f3c18 100644
--- a/Lib/test/test_codecencodings_tw.py
+++ b/Lib/test/test_codecencodings_tw.py
@@ -15,8 +15,8 @@ class Test_Big5(test_multibytecodec_support.TestBase, unittest.TestCase):
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u8b10"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u8b10\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u8b10"),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u8b10\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u8b10"),
     )
 
diff --git a/Lib/test/test_codecmaps_tw.py b/Lib/test/test_codecmaps_tw.py
index 6db5091..412b9de 100644
--- a/Lib/test/test_codecmaps_tw.py
+++ b/Lib/test/test_codecmaps_tw.py
@@ -23,6 +23,9 @@ class TestCP950Map(test_multibytecodec_support.TestBase_Mapping,
         (b'\xa2\xcc', '\u5341'),
         (b'\xa2\xce', '\u5345'),
     ]
+    codectests = (
+        (b"\xFFxy", "replace",  "\ufffdxy"),
+    )
 
 def test_main():
     support.run_unittest(__name__)
diff --git a/Misc/NEWS b/Misc/NEWS
index f742f8a..404185d 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -219,6 +219,10 @@ Core and Builtins
 Library
 -------
 
+- Issue #12016: Multibyte CJK decoders now resynchronize faster. They only
+  ignore the first byte of an invalid byte sequence. For example,
+  b'\xff\n'.decode('gb2312', 'replace') gives '\ufffd\n' instead of '\ufffd'.
+
 - Issue #12459: time.sleep() now raises a ValueError if the sleep length is
   negative, instead of an infinite sleep on Windows or raising an IOError on
   Linux for example, to have the same behaviour on all platforms.
diff --git a/Modules/cjkcodecs/_codecs_cn.c b/Modules/cjkcodecs/_codecs_cn.c
index ab4e659..9e9e96c 100644
--- a/Modules/cjkcodecs/_codecs_cn.c
+++ b/Modules/cjkcodecs/_codecs_cn.c
@@ -85,7 +85,7 @@ DECODER(gb2312)
         TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
             NEXT(2, 1)
         }
-        else return 2;
+        else return 1;
     }
 
     return 0;
@@ -141,7 +141,7 @@ DECODER(gbk)
         REQUIRE_INBUF(2)
 
         GBK_DECODE(c, IN2, **outbuf)
-        else return 2;
+        else return 1;
 
         NEXT(2, 1)
     }
@@ -267,7 +267,7 @@ DECODER(gb18030)
             c3 = IN3;
             c4 = IN4;
             if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
-                return 4;
+                return 1;
             c -= 0x81;  c2 -= 0x30;
             c3 -= 0x81; c4 -= 0x30;
 
@@ -292,12 +292,12 @@ DECODER(gb18030)
                     continue;
                 }
             }
-            return 4;
+            return 1;
         }
 
         GBK_DECODE(c, c2, **outbuf)
         else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
-        else return 2;
+        else return 1;
 
         NEXT(2, 1)
     }
@@ -400,7 +400,7 @@ DECODER(hz)
             else if (c2 == '\n')
                 ; /* line-continuation */
             else
-                return 2;
+                return 1;
             NEXT(2, 0);
             continue;
         }
@@ -419,7 +419,7 @@ DECODER(hz)
                 NEXT(2, 1)
             }
             else
-                return 2;
+                return 1;
         }
     }
 
diff --git a/Modules/cjkcodecs/_codecs_hk.c b/Modules/cjkcodecs/_codecs_hk.c
index 558a42f..d3ad04b 100644
--- a/Modules/cjkcodecs/_codecs_hk.c
+++ b/Modules/cjkcodecs/_codecs_hk.c
@@ -161,7 +161,7 @@ DECODER(big5hkscs)
         case 0x8864: WRITE2(0x00ca, 0x030c); break;
         case 0x88a3: WRITE2(0x00ea, 0x0304); break;
         case 0x88a5: WRITE2(0x00ea, 0x030c); break;
-        default: return 2;
+        default: return 1;
         }
 
         NEXT(2, 2) /* all decoded codepoints are pairs, above. */
diff --git a/Modules/cjkcodecs/_codecs_jp.c b/Modules/cjkcodecs/_codecs_jp.c
index a05e01b..a500696 100644
--- a/Modules/cjkcodecs/_codecs_jp.c
+++ b/Modules/cjkcodecs/_codecs_jp.c
@@ -112,7 +112,7 @@ DECODER(cp932)
         TRYMAP_DEC(cp932ext, **outbuf, c, c2);
         else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){
             if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
-                return 2;
+                return 1;
 
             c = (c < 0xe0 ? c - 0x81 : c - 0xc1);
             c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
@@ -120,7 +120,7 @@ DECODER(cp932)
             c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;
 
             TRYMAP_DEC(jisx0208, **outbuf, c, c2);
-            else return 2;
+            else return 1;
         }
         else if (c >= 0xf0 && c <= 0xf9) {
             if ((c2 >= 0x40 && c2 <= 0x7e) ||
@@ -128,10 +128,10 @@ DECODER(cp932)
                 OUT1(0xe000 + 188 * (c - 0xf0) +
                      (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41))
             else
-                return 2;
+                return 1;
         }
         else
-            return 2;
+            return 1;
 
         NEXT(2, 1)
     }
@@ -256,7 +256,7 @@ DECODER(euc_jis_2004)
                 NEXT(2, 1)
             }
             else
-                return 2;
+                return 1;
         }
         else if (c == 0x8f) {
             unsigned char c2, c3;
@@ -274,7 +274,7 @@ DECODER(euc_jis_2004)
                 continue;
             }
             else TRYMAP_DEC(jisx0212, **outbuf, c2, c3) ;
-            else return 3;
+            else return 1;
             NEXT(3, 1)
         }
         else {
@@ -300,7 +300,7 @@ DECODER(euc_jis_2004)
                 NEXT(2, 2)
                 continue;
             }
-            else return 2;
+            else return 1;
             NEXT(2, 1)
         }
     }
@@ -388,7 +388,7 @@ DECODER(euc_jp)
                 NEXT(2, 1)
             }
             else
-                return 2;
+                return 1;
         }
         else if (c == 0x8f) {
             unsigned char c2, c3;
@@ -401,7 +401,7 @@ DECODER(euc_jp)
                 NEXT(3, 1)
             }
             else
-                return 3;
+                return 1;
         }
         else {
             unsigned char c2;
@@ -417,7 +417,7 @@ DECODER(euc_jp)
 #endif
                 TRYMAP_DEC(jisx0208, **outbuf,
                            c ^ 0x80, c2 ^ 0x80) ;
-            else return 2;
+            else return 1;
             NEXT(2, 1)
         }
     }
@@ -502,7 +502,7 @@ DECODER(shift_jis)
             REQUIRE_INBUF(2)
             c2 = IN2;
             if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
-                return 2;
+                return 1;
 
             c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
             c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
@@ -522,10 +522,10 @@ DECODER(shift_jis)
                 continue;
             }
             else
-                return 2;
+                return 1;
         }
         else
-            return 2;
+            return 1;
 
         NEXT(1, 1) /* JIS X 0201 */
     }
@@ -645,7 +645,7 @@ DECODER(shift_jis_2004)
             REQUIRE_INBUF(2)
             c2 = IN2;
             if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
-                return 2;
+                return 1;
 
             c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
             c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
@@ -671,7 +671,7 @@ DECODER(shift_jis_2004)
                     NEXT_OUT(2)
                 }
                 else
-                    return 2;
+                    return 1;
                 NEXT_IN(2)
             }
             else { /* Plane 2 */
@@ -689,13 +689,13 @@ DECODER(shift_jis_2004)
                     continue;
                 }
                 else
-                    return 2;
+                    return 1;
                 NEXT(2, 1)
             }
             continue;
         }
         else
-            return 2;
+            return 1;
 
         NEXT(1, 1) /* JIS X 0201 */
     }
diff --git a/Modules/cjkcodecs/_codecs_kr.c b/Modules/cjkcodecs/_codecs_kr.c
index 9272e36..f5697dd 100644
--- a/Modules/cjkcodecs/_codecs_kr.c
+++ b/Modules/cjkcodecs/_codecs_kr.c
@@ -123,7 +123,7 @@ DECODER(euc_kr)
             if ((*inbuf)[2] != EUCKR_JAMO_FIRSTBYTE ||
                 (*inbuf)[4] != EUCKR_JAMO_FIRSTBYTE ||
                 (*inbuf)[6] != EUCKR_JAMO_FIRSTBYTE)
-                return 8;
+                return 1;
 
             c = (*inbuf)[3];
             if (0xa1 <= c && c <= 0xbe)
@@ -143,7 +143,7 @@ DECODER(euc_kr)
                 jong = NONE;
 
             if (cho == NONE || jung == NONE || jong == NONE)
-                return 8;
+                return 1;
 
             OUT1(0xac00 + cho*588 + jung*28 + jong);
             NEXT(8, 1)
@@ -152,7 +152,7 @@ DECODER(euc_kr)
             NEXT(2, 1)
         }
         else
-            return 2;
+            return 1;
     }
 
     return 0;
@@ -208,7 +208,7 @@ DECODER(cp949)
         REQUIRE_INBUF(2)
         TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80);
         else TRYMAP_DEC(cp949ext, **outbuf, c, IN2);
-        else return 2;
+        else return 1;
 
         NEXT(2, 1)
     }
@@ -375,7 +375,7 @@ DECODER(johab)
             i_jong = johabidx_jongseong[c_jong];
 
             if (i_cho == NONE || i_jung == NONE || i_jong == NONE)
-                return 2;
+                return 1;
 
             /* we don't use U+1100 hangul jamo yet. */
             if (i_cho == FILL) {
@@ -391,7 +391,7 @@ DECODER(johab)
                         OUT1(0x3100 |
                           johabjamo_jungseong[c_jung])
                     else
-                        return 2;
+                        return 1;
                 }
             } else {
                 if (i_jung == FILL) {
@@ -399,7 +399,7 @@ DECODER(johab)
                         OUT1(0x3100 |
                           johabjamo_choseong[c_cho])
                     else
-                        return 2;
+                        return 1;
                 }
                 else
                     OUT1(0xac00 +
@@ -414,7 +414,7 @@ DECODER(johab)
                 c2 < 0x31 || (c2 >= 0x80 && c2 < 0x91) ||
                 (c2 & 0x7f) == 0x7f ||
                 (c == 0xda && (c2 >= 0xa1 && c2 <= 0xd3)))
-                return 2;
+                return 1;
             else {
                 unsigned char t1, t2;
 
@@ -425,7 +425,7 @@ DECODER(johab)
                 t2 = (t2 < 0x5e ? t2 : t2 - 0x5e) + 0x21;
 
                 TRYMAP_DEC(ksx1001, **outbuf, t1, t2);
-                else return 2;
+                else return 1;
                 NEXT(2, 1)
             }
         }
diff --git a/Modules/cjkcodecs/_codecs_tw.c b/Modules/cjkcodecs/_codecs_tw.c
index 38cf723..916298d 100644
--- a/Modules/cjkcodecs/_codecs_tw.c
+++ b/Modules/cjkcodecs/_codecs_tw.c
@@ -55,7 +55,7 @@ DECODER(big5)
         TRYMAP_DEC(big5, **outbuf, c, IN2) {
             NEXT(2, 1)
         }
-        else return 2;
+        else return 1;
     }
 
     return 0;
@@ -109,7 +109,7 @@ DECODER(cp950)
 
         TRYMAP_DEC(cp950ext, **outbuf, c, IN2);
         else TRYMAP_DEC(big5, **outbuf, c, IN2);
-        else return 2;
+        else return 1;
 
         NEXT(2, 1)
     }
author	Victor Stinner <victor.stinner@haypocalc.com>	2011-07-07 23:45:13 (GMT)
committer	Victor Stinner <victor.stinner@haypocalc.com>	2011-07-07 23:45:13 (GMT)
commit	2cded9c3f31d2fea4b033f44eaa828e508f03391 (patch)
tree	1554d9f0baa575b7ae791ff1267c4e493a1b36bf
parent	081fe46ff96bccb1a256c356443b625b467814c8 (diff)
download	cpython-2cded9c3f31d2fea4b033f44eaa828e508f03391.zip cpython-2cded9c3f31d2fea4b033f44eaa828e508f03391.tar.gz cpython-2cded9c3f31d2fea4b033f44eaa828e508f03391.tar.bz2