summary | refs | log | tree | commit | diff | stats
path: root/Modules/cjkcodecs
diff options
context:
space:
mode:
author: Victor Stinner <victor.stinner@haypocalc.com>, 2011-07-07 23:45:13 (GMT)
committer: Victor Stinner <victor.stinner@haypocalc.com>, 2011-07-07 23:45:13 (GMT)
commit: 2cded9c3f31d2fea4b033f44eaa828e508f03391 (patch)
tree: 1554d9f0baa575b7ae791ff1267c4e493a1b36bf /Modules/cjkcodecs
parent: 081fe46ff96bccb1a256c356443b625b467814c8 (diff)
download: cpython-2cded9c3f31d2fea4b033f44eaa828e508f03391.zip
cpython-2cded9c3f31d2fea4b033f44eaa828e508f03391.tar.gz
cpython-2cded9c3f31d2fea4b033f44eaa828e508f03391.tar.bz2
Issue #12016: Multibyte CJK decoders now resynchronize faster
The decoders now ignore only the first byte of an invalid byte sequence, rather than the entire sequence. For example, b'\xff\n'.decode('gb2312', 'replace') now gives '\ufffd\n' instead of '\ufffd'.
Diffstat (limited to 'Modules/cjkcodecs')
-rw-r--r-- Modules/cjkcodecs/_codecs_cn.c 14
-rw-r--r-- Modules/cjkcodecs/_codecs_hk.c 2
-rw-r--r-- Modules/cjkcodecs/_codecs_jp.c 34
-rw-r--r-- Modules/cjkcodecs/_codecs_kr.c 18
-rw-r--r-- Modules/cjkcodecs/_codecs_tw.c 4
5 files changed, 36 insertions, 36 deletions
diff --git a/Modules/cjkcodecs/_codecs_cn.c b/Modules/cjkcodecs/_codecs_cn.c
index ab4e659..9e9e96c 100644
--- a/Modules/cjkcodecs/_codecs_cn.c
+++ b/Modules/cjkcodecs/_codecs_cn.c
@@ -85,7 +85,7 @@ DECODER(gb2312)
TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
NEXT(2, 1)
}
- else return 2;
+ else return 1;
}
return 0;
@@ -141,7 +141,7 @@ DECODER(gbk)
REQUIRE_INBUF(2)
GBK_DECODE(c, IN2, **outbuf)
- else return 2;
+ else return 1;
NEXT(2, 1)
}
@@ -267,7 +267,7 @@ DECODER(gb18030)
c3 = IN3;
c4 = IN4;
if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
- return 4;
+ return 1;
c -= 0x81; c2 -= 0x30;
c3 -= 0x81; c4 -= 0x30;
@@ -292,12 +292,12 @@ DECODER(gb18030)
continue;
}
}
- return 4;
+ return 1;
}
GBK_DECODE(c, c2, **outbuf)
else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
- else return 2;
+ else return 1;
NEXT(2, 1)
}
@@ -400,7 +400,7 @@ DECODER(hz)
else if (c2 == '\n')
; /* line-continuation */
else
- return 2;
+ return 1;
NEXT(2, 0);
continue;
}
@@ -419,7 +419,7 @@ DECODER(hz)
NEXT(2, 1)
}
else
- return 2;
+ return 1;
}
}
diff --git a/Modules/cjkcodecs/_codecs_hk.c b/Modules/cjkcodecs/_codecs_hk.c
index 558a42f..d3ad04b 100644
--- a/Modules/cjkcodecs/_codecs_hk.c
+++ b/Modules/cjkcodecs/_codecs_hk.c
@@ -161,7 +161,7 @@ DECODER(big5hkscs)
case 0x8864: WRITE2(0x00ca, 0x030c); break;
case 0x88a3: WRITE2(0x00ea, 0x0304); break;
case 0x88a5: WRITE2(0x00ea, 0x030c); break;
- default: return 2;
+ default: return 1;
}
NEXT(2, 2) /* all decoded codepoints are pairs, above. */
diff --git a/Modules/cjkcodecs/_codecs_jp.c b/Modules/cjkcodecs/_codecs_jp.c
index a05e01b..a500696 100644
--- a/Modules/cjkcodecs/_codecs_jp.c
+++ b/Modules/cjkcodecs/_codecs_jp.c
@@ -112,7 +112,7 @@ DECODER(cp932)
TRYMAP_DEC(cp932ext, **outbuf, c, c2);
else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){
if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
- return 2;
+ return 1;
c = (c < 0xe0 ? c - 0x81 : c - 0xc1);
c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
@@ -120,7 +120,7 @@ DECODER(cp932)
c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;
TRYMAP_DEC(jisx0208, **outbuf, c, c2);
- else return 2;
+ else return 1;
}
else if (c >= 0xf0 && c <= 0xf9) {
if ((c2 >= 0x40 && c2 <= 0x7e) ||
@@ -128,10 +128,10 @@ DECODER(cp932)
OUT1(0xe000 + 188 * (c - 0xf0) +
(c2 < 0x80 ? c2 - 0x40 : c2 - 0x41))
else
- return 2;
+ return 1;
}
else
- return 2;
+ return 1;
NEXT(2, 1)
}
@@ -256,7 +256,7 @@ DECODER(euc_jis_2004)
NEXT(2, 1)
}
else
- return 2;
+ return 1;
}
else if (c == 0x8f) {
unsigned char c2, c3;
@@ -274,7 +274,7 @@ DECODER(euc_jis_2004)
continue;
}
else TRYMAP_DEC(jisx0212, **outbuf, c2, c3) ;
- else return 3;
+ else return 1;
NEXT(3, 1)
}
else {
@@ -300,7 +300,7 @@ DECODER(euc_jis_2004)
NEXT(2, 2)
continue;
}
- else return 2;
+ else return 1;
NEXT(2, 1)
}
}
@@ -388,7 +388,7 @@ DECODER(euc_jp)
NEXT(2, 1)
}
else
- return 2;
+ return 1;
}
else if (c == 0x8f) {
unsigned char c2, c3;
@@ -401,7 +401,7 @@ DECODER(euc_jp)
NEXT(3, 1)
}
else
- return 3;
+ return 1;
}
else {
unsigned char c2;
@@ -417,7 +417,7 @@ DECODER(euc_jp)
#endif
TRYMAP_DEC(jisx0208, **outbuf,
c ^ 0x80, c2 ^ 0x80) ;
- else return 2;
+ else return 1;
NEXT(2, 1)
}
}
@@ -502,7 +502,7 @@ DECODER(shift_jis)
REQUIRE_INBUF(2)
c2 = IN2;
if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
- return 2;
+ return 1;
c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
@@ -522,10 +522,10 @@ DECODER(shift_jis)
continue;
}
else
- return 2;
+ return 1;
}
else
- return 2;
+ return 1;
NEXT(1, 1) /* JIS X 0201 */
}
@@ -645,7 +645,7 @@ DECODER(shift_jis_2004)
REQUIRE_INBUF(2)
c2 = IN2;
if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
- return 2;
+ return 1;
c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
@@ -671,7 +671,7 @@ DECODER(shift_jis_2004)
NEXT_OUT(2)
}
else
- return 2;
+ return 1;
NEXT_IN(2)
}
else { /* Plane 2 */
@@ -689,13 +689,13 @@ DECODER(shift_jis_2004)
continue;
}
else
- return 2;
+ return 1;
NEXT(2, 1)
}
continue;
}
else
- return 2;
+ return 1;
NEXT(1, 1) /* JIS X 0201 */
}
diff --git a/Modules/cjkcodecs/_codecs_kr.c b/Modules/cjkcodecs/_codecs_kr.c
index 9272e36..f5697dd 100644
--- a/Modules/cjkcodecs/_codecs_kr.c
+++ b/Modules/cjkcodecs/_codecs_kr.c
@@ -123,7 +123,7 @@ DECODER(euc_kr)
if ((*inbuf)[2] != EUCKR_JAMO_FIRSTBYTE ||
(*inbuf)[4] != EUCKR_JAMO_FIRSTBYTE ||
(*inbuf)[6] != EUCKR_JAMO_FIRSTBYTE)
- return 8;
+ return 1;
c = (*inbuf)[3];
if (0xa1 <= c && c <= 0xbe)
@@ -143,7 +143,7 @@ DECODER(euc_kr)
jong = NONE;
if (cho == NONE || jung == NONE || jong == NONE)
- return 8;
+ return 1;
OUT1(0xac00 + cho*588 + jung*28 + jong);
NEXT(8, 1)
@@ -152,7 +152,7 @@ DECODER(euc_kr)
NEXT(2, 1)
}
else
- return 2;
+ return 1;
}
return 0;
@@ -208,7 +208,7 @@ DECODER(cp949)
REQUIRE_INBUF(2)
TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80);
else TRYMAP_DEC(cp949ext, **outbuf, c, IN2);
- else return 2;
+ else return 1;
NEXT(2, 1)
}
@@ -375,7 +375,7 @@ DECODER(johab)
i_jong = johabidx_jongseong[c_jong];
if (i_cho == NONE || i_jung == NONE || i_jong == NONE)
- return 2;
+ return 1;
/* we don't use U+1100 hangul jamo yet. */
if (i_cho == FILL) {
@@ -391,7 +391,7 @@ DECODER(johab)
OUT1(0x3100 |
johabjamo_jungseong[c_jung])
else
- return 2;
+ return 1;
}
} else {
if (i_jung == FILL) {
@@ -399,7 +399,7 @@ DECODER(johab)
OUT1(0x3100 |
johabjamo_choseong[c_cho])
else
- return 2;
+ return 1;
}
else
OUT1(0xac00 +
@@ -414,7 +414,7 @@ DECODER(johab)
c2 < 0x31 || (c2 >= 0x80 && c2 < 0x91) ||
(c2 & 0x7f) == 0x7f ||
(c == 0xda && (c2 >= 0xa1 && c2 <= 0xd3)))
- return 2;
+ return 1;
else {
unsigned char t1, t2;
@@ -425,7 +425,7 @@ DECODER(johab)
t2 = (t2 < 0x5e ? t2 : t2 - 0x5e) + 0x21;
TRYMAP_DEC(ksx1001, **outbuf, t1, t2);
- else return 2;
+ else return 1;
NEXT(2, 1)
}
}
diff --git a/Modules/cjkcodecs/_codecs_tw.c b/Modules/cjkcodecs/_codecs_tw.c
index 38cf723..916298d 100644
--- a/Modules/cjkcodecs/_codecs_tw.c
+++ b/Modules/cjkcodecs/_codecs_tw.c
@@ -55,7 +55,7 @@ DECODER(big5)
TRYMAP_DEC(big5, **outbuf, c, IN2) {
NEXT(2, 1)
}
- else return 2;
+ else return 1;
}
return 0;
@@ -109,7 +109,7 @@ DECODER(cp950)
TRYMAP_DEC(cp950ext, **outbuf, c, IN2);
else TRYMAP_DEC(big5, **outbuf, c, IN2);
- else return 2;
+ else return 1;
NEXT(2, 1)
}