diff options
author | Victor Stinner <victor.stinner@gmail.com> | 2013-04-14 00:06:32 (GMT) |
---|---|---|
committer | Victor Stinner <victor.stinner@gmail.com> | 2013-04-14 00:06:32 (GMT) |
commit | d949126995a7ff63590285aa816da65d97a31403 (patch) | |
tree | 5fae129dff24d0a40c415b2a8d1559ff613aceea | |
parent | 71557596b26e9c899e83adc99659732097097c4e (diff) | |
download | cpython-d949126995a7ff63590285aa816da65d97a31403.zip cpython-d949126995a7ff63590285aa816da65d97a31403.tar.gz cpython-d949126995a7ff63590285aa816da65d97a31403.tar.bz2 |
Issue #17693: CJK encoders now use the new Unicode API (PEP 393)
-rw-r--r-- | Modules/cjkcodecs/_codecs_cn.c | 135 | ||||
-rw-r--r-- | Modules/cjkcodecs/_codecs_hk.c | 44 | ||||
-rw-r--r-- | Modules/cjkcodecs/_codecs_iso2022.c | 113 | ||||
-rw-r--r-- | Modules/cjkcodecs/_codecs_jp.c | 168 | ||||
-rw-r--r-- | Modules/cjkcodecs/_codecs_kr.c | 98 | ||||
-rw-r--r-- | Modules/cjkcodecs/_codecs_tw.c | 44 | ||||
-rw-r--r-- | Modules/cjkcodecs/cjkcodecs.h | 68 | ||||
-rw-r--r-- | Modules/cjkcodecs/multibytecodec.c | 172 | ||||
-rw-r--r-- | Modules/cjkcodecs/multibytecodec.h | 6 |
9 files changed, 430 insertions, 418 deletions
diff --git a/Modules/cjkcodecs/_codecs_cn.c b/Modules/cjkcodecs/_codecs_cn.c index ba6b4ee..285da1e 100644 --- a/Modules/cjkcodecs/_codecs_cn.c +++ b/Modules/cjkcodecs/_codecs_cn.c @@ -42,16 +42,18 @@ ENCODER(gb2312) { - while (inleft > 0) { - Py_UCS4 c = IN1; + while (*inpos < inlen) { + Py_UCS4 c = INCHAR1; DBCHAR code; if (c < 0x80) { - WRITE1((unsigned char)c) - NEXT(1, 1) + WRITEBYTE1((unsigned char)c) + NEXT(1, 1); continue; } - UCS4INVALID(c) + + if (c > 0xFFFF) + return 1; REQUIRE_OUTBUF(2) TRYMAP_ENC(gbcommon, code, c); @@ -60,9 +62,9 @@ ENCODER(gb2312) if (code & 0x8000) /* MSB set: GBK */ return 1; - OUT1((code >> 8) | 0x80) - OUT2((code & 0xFF) | 0x80) - NEXT(1, 2) + OUTBYTE1((code >> 8) | 0x80) + OUTBYTE2((code & 0xFF) | 0x80) + NEXT(1, 2); } return 0; @@ -80,7 +82,7 @@ DECODER(gb2312) } REQUIRE_INBUF(2) - TRYMAP_DEC(gb2312, writer, c ^ 0x80, IN2 ^ 0x80) { + TRYMAP_DEC(gb2312, writer, c ^ 0x80, INBYTE2 ^ 0x80) { NEXT_IN(2); } else return 1; @@ -96,28 +98,30 @@ DECODER(gb2312) ENCODER(gbk) { - while (inleft > 0) { - Py_UCS4 c = IN1; + while (*inpos < inlen) { + Py_UCS4 c = INCHAR1; DBCHAR code; if (c < 0x80) { - WRITE1((unsigned char)c) - NEXT(1, 1) + WRITEBYTE1((unsigned char)c) + NEXT(1, 1); continue; } - UCS4INVALID(c) + + if (c > 0xFFFF) + return 1; REQUIRE_OUTBUF(2) GBK_ENCODE(c, code) else return 1; - OUT1((code >> 8) | 0x80) + OUTBYTE1((code >> 8) | 0x80) if (code & 0x8000) - OUT2((code & 0xFF)) /* MSB set: GBK */ + OUTBYTE2((code & 0xFF)) /* MSB set: GBK */ else - OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */ - NEXT(1, 2) + OUTBYTE2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */ + NEXT(1, 2); } return 0; @@ -126,7 +130,7 @@ ENCODER(gbk) DECODER(gbk) { while (inleft > 0) { - unsigned char c = IN1; + unsigned char c = INBYTE1; if (c < 0x80) { OUTCHAR(c); @@ -136,7 +140,7 @@ DECODER(gbk) REQUIRE_INBUF(2) - GBK_DECODE(c, IN2, writer) + GBK_DECODE(c, INBYTE2, writer) else return 1; NEXT_IN(2); @@ -152,41 +156,31 @@ DECODER(gbk) ENCODER(gb18030) { - while (inleft > 0) { - Py_UCS4 c = IN1; + while (*inpos < inlen) { + Py_UCS4 c = INCHAR1; DBCHAR code; if (c < 0x80) { - WRITE1(c) - NEXT(1, 1) + WRITEBYTE1(c) + NEXT(1, 1); continue; } - DECODE_SURROGATE(c) - if (c > 0x10FFFF) -#if Py_UNICODE_SIZE == 2 - return 2; /* surrogates pair */ -#else - return 1; -#endif - else if (c >= 0x10000) { + if (c >= 0x10000) { Py_UCS4 tc = c - 0x10000; + assert (c <= 0x10FFFF); REQUIRE_OUTBUF(4) - OUT4((unsigned char)(tc % 10) + 0x30) + OUTBYTE4((unsigned char)(tc % 10) + 0x30) tc /= 10; - OUT3((unsigned char)(tc % 126) + 0x81) + OUTBYTE3((unsigned char)(tc % 126) + 0x81) tc /= 126; - OUT2((unsigned char)(tc % 10) + 0x30) + OUTBYTE2((unsigned char)(tc % 10) + 0x30) tc /= 10; - OUT1((unsigned char)(tc + 0x90)) + OUTBYTE1((unsigned char)(tc + 0x90)) -#if Py_UNICODE_SIZE == 2 - NEXT(2, 4) /* surrogates pair */ -#else - NEXT(1, 4) -#endif + NEXT(1, 4); continue; } @@ -209,15 +203,15 @@ ENCODER(gb18030) tc = c - utrrange->first + utrrange->base; - OUT4((unsigned char)(tc % 10) + 0x30) + OUTBYTE4((unsigned char)(tc % 10) + 0x30) tc /= 10; - OUT3((unsigned char)(tc % 126) + 0x81) + OUTBYTE3((unsigned char)(tc % 126) + 0x81) tc /= 126; - OUT2((unsigned char)(tc % 10) + 0x30) + OUTBYTE2((unsigned char)(tc % 10) + 0x30) tc /= 10; - OUT1((unsigned char)tc + 0x81) + OUTBYTE1((unsigned char)tc + 0x81) - NEXT(1, 4) + NEXT(1, 4); break; } @@ -226,13 +220,13 @@ ENCODER(gb18030) continue; } - OUT1((code >> 8) | 0x80) + OUTBYTE1((code >> 8) | 0x80) if (code & 0x8000) - OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */ + OUTBYTE2((code & 0xFF)) /* MSB set: GBK or GB18030ext */ else - OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */ + OUTBYTE2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */ - NEXT(1, 2) + NEXT(1, 2); } return 0; @@ -241,7 +235,7 @@ ENCODER(gb18030) DECODER(gb18030) { while (inleft > 0) { - unsigned char c = IN1, c2; + unsigned char c = INBYTE1, c2; if (c < 0x80) { OUTCHAR(c); @@ -251,15 +245,15 @@ DECODER(gb18030) REQUIRE_INBUF(2) - c2 = IN2; + c2 = INBYTE2; if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */ const struct _gb18030_to_unibmp_ranges *utr; unsigned char c3, c4; Py_UCS4 lseq; REQUIRE_INBUF(4) - c3 = IN3; - c4 = IN4; + c3 = INBYTE3; + c4 = INBYTE4; if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39) return 1; c -= 0x81; c2 -= 0x30; @@ -313,33 +307,34 @@ ENCODER_INIT(hz) ENCODER_RESET(hz) { if (state->i != 0) { - WRITE2('~', '}') + WRITEBYTE2('~', '}') state->i = 0; - NEXT_OUT(2) + NEXT_OUT(2); } return 0; } ENCODER(hz) { - while (inleft > 0) { - Py_UCS4 c = IN1; + while (*inpos < inlen) { + Py_UCS4 c = INCHAR1; DBCHAR code; if (c < 0x80) { if (state->i == 0) { - WRITE1((unsigned char)c) - NEXT(1, 1) + WRITEBYTE1((unsigned char)c) + NEXT(1, 1); } else { - WRITE3('~', '}', (unsigned char)c) - NEXT(1, 3) + WRITEBYTE3('~', '}', (unsigned char)c) + NEXT(1, 3); state->i = 0; } continue; } - UCS4INVALID(c) + if (c > 0xFFFF) + return 1; TRYMAP_ENC(gbcommon, code, c); else return 1; @@ -348,13 +343,13 @@ ENCODER(hz) return 1; if (state->i == 0) { - WRITE4('~', '{', code >> 8, code & 0xff) - NEXT(1, 4) + WRITEBYTE4('~', '{', code >> 8, code & 0xff) + NEXT(1, 4); state->i = 1; } else { - WRITE2(code >> 8, code & 0xff) - NEXT(1, 2) + WRITEBYTE2(code >> 8, code & 0xff) + NEXT(1, 2); } } @@ -376,10 +371,10 @@ DECODER_RESET(hz) DECODER(hz) { while (inleft > 0) { - unsigned char c = IN1; + unsigned char c = INBYTE1; if (c == '~') { - unsigned char c2 = IN2; + unsigned char c2 = INBYTE2; REQUIRE_INBUF(2) if (c2 == '~') { @@ -408,7 +403,7 @@ DECODER(hz) } else { /* GB mode */ REQUIRE_INBUF(2) - TRYMAP_DEC(gb2312, writer, c, IN2) { + TRYMAP_DEC(gb2312, writer, c, INBYTE2) { NEXT_IN(2); } else diff --git a/Modules/cjkcodecs/_codecs_hk.c b/Modules/cjkcodecs/_codecs_hk.c index e31664b..fe5f597 100644 --- a/Modules/cjkcodecs/_codecs_hk.c +++ b/Modules/cjkcodecs/_codecs_hk.c @@ -38,35 +38,39 @@ static const DBCHAR big5hkscs_pairenc_table[4] = {0x8862, 0x8864, 0x88a3, 0x88a5 ENCODER(big5hkscs) { - while (inleft > 0) { - Py_UCS4 c = **inbuf; + while (*inpos < inlen) { + Py_UCS4 c = INCHAR1; DBCHAR code; Py_ssize_t insize; if (c < 0x80) { REQUIRE_OUTBUF(1) **outbuf = (unsigned char)c; - NEXT(1, 1) + NEXT(1, 1); continue; } - DECODE_SURROGATE(c) - insize = GET_INSIZE(c); - + insize = 1; REQUIRE_OUTBUF(2) if (c < 0x10000) { TRYMAP_ENC(big5hkscs_bmp, code, c) { if (code == MULTIC) { - if (inleft >= 2 && + Py_UCS4 c2; + if (inlen - *inpos >= 2) + c2 = INCHAR2; + else + c2 = 0; + + if (inlen - *inpos >= 2 && ((c & 0xffdf) == 0x00ca) && - (((*inbuf)[1] & 0xfff7) == 0x0304)) { + ((c2 & 0xfff7) == 0x0304)) { code = big5hkscs_pairenc_table[ ((c >> 4) | - ((*inbuf)[1] >> 3)) & 3]; + (c2 >> 3)) & 3]; insize = 2; } - else if (inleft < 2 && + else if (inlen - *inpos < 2 && !(flags & MBENC_FLUSH)) return MBERR_TOOFEW; else { @@ -89,9 +93,9 @@ ENCODER(big5hkscs) else return insize; - OUT1(code >> 8) - OUT2(code & 0xFF) - NEXT(insize, 2) + OUTBYTE1(code >> 8) + OUTBYTE2(code & 0xFF) + NEXT(insize, 2); } return 0; @@ -102,7 +106,7 @@ ENCODER(big5hkscs) DECODER(big5hkscs) { while (inleft > 0) { - unsigned char c = IN1; + unsigned char c = INBYTE1; Py_UCS4 decoded; if (c < 0x80) { @@ -113,20 +117,20 @@ DECODER(big5hkscs) REQUIRE_INBUF(2) - if (0xc6 > c || c > 0xc8 || (c < 0xc7 && IN2 < 0xa1)) { - TRYMAP_DEC(big5, writer, c, IN2) { + if (0xc6 > c || c > 0xc8 || (c < 0xc7 && INBYTE2 < 0xa1)) { + TRYMAP_DEC(big5, writer, c, INBYTE2) { NEXT_IN(2); continue; } } - TRYMAP_DEC_CHAR(big5hkscs, decoded, c, IN2) + TRYMAP_DEC_CHAR(big5hkscs, decoded, c, INBYTE2) { - int s = BH2S(c, IN2); + int s = BH2S(c, INBYTE2); const unsigned char *hintbase; assert(0x87 <= c && c <= 0xfe); - assert(0x40 <= IN2 && IN2 <= 0xfe); + assert(0x40 <= INBYTE2 && INBYTE2 <= 0xfe); if (BH2S(0x87, 0x40) <= s && s <= BH2S(0xa0, 0xfe)) { hintbase = big5hkscs_phint_0; @@ -154,7 +158,7 @@ DECODER(big5hkscs) continue; } - switch ((c << 8) | IN2) { + switch ((c << 8) | INBYTE2) { case 0x8862: OUTCHAR2(0x00ca, 0x0304); break; case 0x8864: OUTCHAR2(0x00ca, 0x030c); break; case 0x88a3: OUTCHAR2(0x00ea, 0x0304); break; diff --git a/Modules/cjkcodecs/_codecs_iso2022.c b/Modules/cjkcodecs/_codecs_iso2022.c index ae14677..bb63835 100644 --- a/Modules/cjkcodecs/_codecs_iso2022.c +++ b/Modules/cjkcodecs/_codecs_iso2022.c @@ -141,13 +141,13 @@ ENCODER_INIT(iso2022) ENCODER_RESET(iso2022) { if (STATE_GETFLAG(F_SHIFTED)) { - WRITE1(SI) - NEXT_OUT(1) + WRITEBYTE1(SI) + NEXT_OUT(1); STATE_CLEARFLAG(F_SHIFTED) } if (STATE_G0 != CHARSET_ASCII) { - WRITE3(ESC, '(', 'B') - NEXT_OUT(3) + WRITEBYTE3(ESC, '(', 'B') + NEXT_OUT(3); STATE_SETG0(CHARSET_ASCII) } return 0; @@ -155,30 +155,29 @@ ENCODER_RESET(iso2022) ENCODER(iso2022) { - while (inleft > 0) { + while (*inpos < inlen) { const struct iso2022_designation *dsg; DBCHAR encoded; - Py_UCS4 c = **inbuf; + Py_UCS4 c = INCHAR1; Py_ssize_t insize; if (c < 0x80) { if (STATE_G0 != CHARSET_ASCII) { - WRITE3(ESC, '(', 'B') + WRITEBYTE3(ESC, '(', 'B') STATE_SETG0(CHARSET_ASCII) - NEXT_OUT(3) + NEXT_OUT(3); } if (STATE_GETFLAG(F_SHIFTED)) { - WRITE1(SI) + WRITEBYTE1(SI) STATE_CLEARFLAG(F_SHIFTED) - NEXT_OUT(1) + NEXT_OUT(1); } - WRITE1((unsigned char)c) - NEXT(1, 1) + WRITEBYTE1((unsigned char)c) + NEXT(1, 1); continue; } - DECODE_SURROGATE(c) - insize = GET_INSIZE(c); + insize = 1; encoded = MAP_UNMAPPABLE; for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) { @@ -187,24 +186,14 @@ ENCODER(iso2022) if (encoded == MAP_MULTIPLE_AVAIL) { /* this implementation won't work for pair * of non-bmp characters. */ - if (inleft < 2) { + if (inlen - *inpos < 2) { if (!(flags & MBENC_FLUSH)) return MBERR_TOOFEW; length = -1; } else length = 2; -#if Py_UNICODE_SIZE == 2 - if (length == 2) { - Py_UCS4 u4in[2]; - u4in[0] = (Py_UCS4)IN1; - u4in[1] = (Py_UCS4)IN2; - encoded = dsg->encoder(u4in, &length); - } else - encoded = dsg->encoder(&c, &length); -#else encoded = dsg->encoder(&c, &length); -#endif if (encoded != MAP_UNMAPPABLE) { insize = length; break; @@ -221,47 +210,47 @@ ENCODER(iso2022) switch (dsg->plane) { case 0: /* G0 */ if (STATE_GETFLAG(F_SHIFTED)) { - WRITE1(SI) + WRITEBYTE1(SI) STATE_CLEARFLAG(F_SHIFTED) - NEXT_OUT(1) + NEXT_OUT(1); } if (STATE_G0 != dsg->mark) { if (dsg->width == 1) { - WRITE3(ESC, '(', ESCMARK(dsg->mark)) + WRITEBYTE3(ESC, '(', ESCMARK(dsg->mark)) STATE_SETG0(dsg->mark) - NEXT_OUT(3) + NEXT_OUT(3); } else if (dsg->mark == CHARSET_JISX0208) { - WRITE3(ESC, '$', ESCMARK(dsg->mark)) + WRITEBYTE3(ESC, '$', ESCMARK(dsg->mark)) STATE_SETG0(dsg->mark) - NEXT_OUT(3) + NEXT_OUT(3); } else { - WRITE4(ESC, '$', '(', + WRITEBYTE4(ESC, '$', '(', ESCMARK(dsg->mark)) STATE_SETG0(dsg->mark) - NEXT_OUT(4) + NEXT_OUT(4); } } break; case 1: /* G1 */ if (STATE_G1 != dsg->mark) { if (dsg->width == 1) { - WRITE3(ESC, ')', ESCMARK(dsg->mark)) + WRITEBYTE3(ESC, ')', ESCMARK(dsg->mark)) STATE_SETG1(dsg->mark) - NEXT_OUT(3) + NEXT_OUT(3); } else { - WRITE4(ESC, '$', ')', + WRITEBYTE4(ESC, '$', ')', ESCMARK(dsg->mark)) STATE_SETG1(dsg->mark) - NEXT_OUT(4) + NEXT_OUT(4); } } if (!STATE_GETFLAG(F_SHIFTED)) { - WRITE1(SO) + WRITEBYTE1(SO) STATE_SETFLAG(F_SHIFTED) - NEXT_OUT(1) + NEXT_OUT(1); } break; default: /* G2 and G3 is not supported: no encoding in @@ -270,14 +259,14 @@ ENCODER(iso2022) } if (dsg->width == 1) { - WRITE1((unsigned char)encoded) - NEXT_OUT(1) + WRITEBYTE1((unsigned char)encoded) + NEXT_OUT(1); } else { - WRITE2(encoded >> 8, encoded & 0xff) - NEXT_OUT(2) + WRITEBYTE2(encoded >> 8, encoded & 0xff) + NEXT_OUT(2); } - NEXT_IN(insize); + NEXT_INCHAR(insize); } return 0; @@ -323,26 +312,26 @@ iso2022processesc(const void *config, MultibyteCodec_State *state, switch (esclen) { case 3: - if (IN2 == '$') { - charset = IN3 | CHARSET_DBCS; + if (INBYTE2 == '$') { + charset = INBYTE3 | CHARSET_DBCS; designation = 0; } else { - charset = IN3; - if (IN2 == '(') designation = 0; - else if (IN2 == ')') designation = 1; - else if (CONFIG_ISSET(USE_G2) && IN2 == '.') + charset = INBYTE3; + if (INBYTE2 == '(') designation = 0; + else if (INBYTE2 == ')') designation = 1; + else if (CONFIG_ISSET(USE_G2) && INBYTE2 == '.') designation = 2; else return 3; } break; case 4: - if (IN2 != '$') + if (INBYTE2 != '$') return 4; - charset = IN4 | CHARSET_DBCS; - if (IN3 == '(') designation = 0; - else if (IN3 == ')') designation = 1; + charset = INBYTE4 | CHARSET_DBCS; + if (INBYTE3 == '(') designation = 0; + else if (INBYTE3 == ')') designation = 1; else return 4; break; case 6: /* designation with prefix */ @@ -395,18 +384,18 @@ iso2022processg2(const void *config, MultibyteCodec_State *state, /* not written to use encoder, decoder functions because only few * encodings use G2 designations in CJKCodecs */ if (STATE_G2 == CHARSET_ISO8859_1) { - if (IN3 < 0x80) - OUTCHAR(IN3 + 0x80); + if (INBYTE3 < 0x80) + OUTCHAR(INBYTE3 + 0x80); else return 3; } else if (STATE_G2 == CHARSET_ISO8859_7) { - ISO8859_7_DECODE(IN3 ^ 0x80, writer) + ISO8859_7_DECODE(INBYTE3 ^ 0x80, writer) else return 3; } else if (STATE_G2 == CHARSET_ASCII) { - if (IN3 & 0x80) return 3; - else OUTCHAR(IN3); + if (INBYTE3 & 0x80) return 3; + else OUTCHAR(INBYTE3); } else return MBERR_INTERNAL; @@ -421,7 +410,7 @@ DECODER(iso2022) const struct iso2022_designation *dsgcache = NULL; while (inleft > 0) { - unsigned char c = IN1; + unsigned char c = INBYTE1; Py_ssize_t err; if (STATE_GETFLAG(F_ESCTHROUGHOUT)) { @@ -438,13 +427,13 @@ DECODER(iso2022) switch (c) { case ESC: REQUIRE_INBUF(2) - if (IS_ISO2022ESC(IN2)) { + if (IS_ISO2022ESC(INBYTE2)) { err = iso2022processesc(config, state, inbuf, &inleft); if (err != 0) return err; } - else if (CONFIG_ISSET(USE_G2) && IN2 == 'N') {/* SS2 */ + else if (CONFIG_ISSET(USE_G2) && INBYTE2 == 'N') {/* SS2 */ REQUIRE_INBUF(3) err = iso2022processg2(config, state, inbuf, &inleft, writer); diff --git a/Modules/cjkcodecs/_codecs_jp.c b/Modules/cjkcodecs/_codecs_jp.c index 8bfb813..7ab318b 100644 --- a/Modules/cjkcodecs/_codecs_jp.c +++ b/Modules/cjkcodecs/_codecs_jp.c @@ -19,38 +19,39 @@ ENCODER(cp932) { - while (inleft > 0) { - Py_UCS4 c = IN1; + while (*inpos < inlen) { + Py_UCS4 c = INCHAR1; DBCHAR code; unsigned char c1, c2; if (c <= 0x80) { - WRITE1((unsigned char)c) - NEXT(1, 1) + WRITEBYTE1((unsigned char)c) + NEXT(1, 1); continue; } else if (c >= 0xff61 && c <= 0xff9f) { - WRITE1(c - 0xfec0) - NEXT(1, 1) + WRITEBYTE1(c - 0xfec0) + NEXT(1, 1); continue; } else if (c >= 0xf8f0 && c <= 0xf8f3) { /* Windows compatibility */ REQUIRE_OUTBUF(1) if (c == 0xf8f0) - OUT1(0xa0) + OUTBYTE1(0xa0) else - OUT1(c - 0xfef1 + 0xfd) - NEXT(1, 1) + OUTBYTE1(c - 0xfef1 + 0xfd) + NEXT(1, 1); continue; } - UCS4INVALID(c) + if (c > 0xFFFF) + return 1; REQUIRE_OUTBUF(2) TRYMAP_ENC(cp932ext, code, c) { - OUT1(code >> 8) - OUT2(code & 0xff) + OUTBYTE1(code >> 8) + OUTBYTE2(code & 0xff) } else TRYMAP_ENC(jisxcommon, code, c) { if (code & 0x8000) /* MSB set: JIS X 0212 */ @@ -61,20 +62,20 @@ ENCODER(cp932) c2 = code & 0xff; c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21); c1 = (c1 - 0x21) >> 1; - OUT1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1) - OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41) + OUTBYTE1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1) + OUTBYTE2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41) } else if (c >= 0xe000 && c < 0xe758) { /* User-defined area */ c1 = (Py_UCS4)(c - 0xe000) / 188; c2 = (Py_UCS4)(c - 0xe000) % 188; - OUT1(c1 + 0xf0) - OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41) + OUTBYTE1(c1 + 0xf0) + OUTBYTE2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41) } else return 1; - NEXT(1, 2) + NEXT(1, 2); } return 0; @@ -83,7 +84,7 @@ ENCODER(cp932) DECODER(cp932) { while (inleft > 0) { - unsigned char c = IN1, c2; + unsigned char c = INBYTE1, c2; if (c <= 0x80) { OUTCHAR(c); @@ -106,7 +107,7 @@ DECODER(cp932) } REQUIRE_INBUF(2) - c2 = IN2; + c2 = INBYTE2; TRYMAP_DEC(cp932ext, writer, c, c2); else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){ @@ -145,25 +146,24 @@ DECODER(cp932) ENCODER(euc_jis_2004) { - while (inleft > 0) { - Py_UCS4 c = IN1; + while (*inpos < inlen) { + Py_UCS4 c = INCHAR1; DBCHAR code; Py_ssize_t insize; if (c < 0x80) { - WRITE1(c) - NEXT(1, 1) + WRITEBYTE1(c) + NEXT(1, 1); continue; } - DECODE_SURROGATE(c) - insize = GET_INSIZE(c); + insize = 1; if (c <= 0xFFFF) { EMULATE_JISX0213_2000_ENCODE_BMP(code, c) else TRYMAP_ENC(jisx0213_bmp, code, c) { if (code == MULTIC) { - if (inleft < 2) { + if (inlen - *inpos < 2) { if (flags & MBENC_FLUSH) { code = find_pairencmap( (ucs2_t)c, 0, @@ -176,8 +176,9 @@ ENCODER(euc_jis_2004) return MBERR_TOOFEW; } else { + Py_UCS4 c2 = INCHAR2; code = find_pairencmap( - (ucs2_t)c, (*inbuf)[1], + (ucs2_t)c, c2, jisx0213_pair_encmap, JISX0213_ENCPAIRS); if (code == DBCINV) { @@ -195,8 +196,8 @@ ENCODER(euc_jis_2004) else TRYMAP_ENC(jisxcommon, code, c); else if (c >= 0xff61 && c <= 0xff9f) { /* JIS X 0201 half-width katakana */ - WRITE2(0x8e, c - 0xfec0) - NEXT(1, 2) + WRITEBYTE2(0x8e, c - 0xfec0) + NEXT(1, 2); continue; } else if (c == 0xff3c) @@ -218,12 +219,12 @@ ENCODER(euc_jis_2004) if (code & 0x8000) { /* Codeset 2 */ - WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80) - NEXT(insize, 3) + WRITEBYTE3(0x8f, code >> 8, (code & 0xFF) | 0x80) + NEXT(insize, 3); } else { /* Codeset 1 */ - WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80) - NEXT(insize, 2) + WRITEBYTE2((code >> 8) | 0x80, (code & 0xFF) | 0x80) + NEXT(insize, 2); } } @@ -233,7 +234,7 @@ ENCODER(euc_jis_2004) DECODER(euc_jis_2004) { while (inleft > 0) { - unsigned char c = IN1; + unsigned char c = INBYTE1; Py_UCS4 code; if (c < 0x80) { @@ -247,7 +248,7 @@ DECODER(euc_jis_2004) unsigned char c2; REQUIRE_INBUF(2) - c2 = IN2; + c2 = INBYTE2; if (c2 >= 0xa1 && c2 <= 0xdf) { OUTCHAR(0xfec0 + c2); NEXT_IN(2); @@ -259,8 +260,8 @@ DECODER(euc_jis_2004) unsigned char c2, c3; REQUIRE_INBUF(3) - c2 = IN2 ^ 0x80; - c3 = IN3 ^ 0x80; + c2 = INBYTE2 ^ 0x80; + c3 = INBYTE3 ^ 0x80; /* JIS X 0213 Plane 2 or JIS X 0212 (see NOTES) */ EMULATE_JISX0213_2000_DECODE_PLANE2(writer, c2, c3) @@ -279,7 +280,7 @@ DECODER(euc_jis_2004) REQUIRE_INBUF(2) c ^= 0x80; - c2 = IN2 ^ 0x80; + c2 = INBYTE2 ^ 0x80; /* JIS X 0213 Plane 1 */ EMULATE_JISX0213_2000_DECODE_PLANE1(writer, c, c2) @@ -312,35 +313,36 @@ DECODER(euc_jis_2004) ENCODER(euc_jp) { - while (inleft > 0) { - Py_UCS4 c = IN1; + while (*inpos < inlen) { + Py_UCS4 c = INCHAR1; DBCHAR code; if (c < 0x80) { - WRITE1((unsigned char)c) - NEXT(1, 1) + WRITEBYTE1((unsigned char)c) + NEXT(1, 1); continue; } - UCS4INVALID(c) + if (c > 0xFFFF) + return 1; TRYMAP_ENC(jisxcommon, code, c); else if (c >= 0xff61 && c <= 0xff9f) { /* JIS X 0201 half-width katakana */ - WRITE2(0x8e, c - 0xfec0) - NEXT(1, 2) + WRITEBYTE2(0x8e, c - 0xfec0) + NEXT(1, 2); continue; } #ifndef STRICT_BUILD else if (c == 0xff3c) /* FULL-WIDTH REVERSE SOLIDUS */ code = 0x2140; else if (c == 0xa5) { /* YEN SIGN */ - WRITE1(0x5c); - NEXT(1, 1) + WRITEBYTE1(0x5c); + NEXT(1, 1); continue; } else if (c == 0x203e) { /* OVERLINE */ - WRITE1(0x7e); - NEXT(1, 1) + WRITEBYTE1(0x7e); + NEXT(1, 1); continue; } #endif @@ -349,12 +351,12 @@ ENCODER(euc_jp) if (code & 0x8000) { /* JIS X 0212 */ - WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80) - NEXT(1, 3) + WRITEBYTE3(0x8f, code >> 8, (code & 0xFF) | 0x80) + NEXT(1, 3); } else { /* JIS X 0208 */ - WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80) - NEXT(1, 2) + WRITEBYTE2((code >> 8) | 0x80, (code & 0xFF) | 0x80) + NEXT(1, 2); } } @@ -364,7 +366,7 @@ ENCODER(euc_jp) DECODER(euc_jp) { while (inleft > 0) { - unsigned char c = IN1; + unsigned char c = INBYTE1; if (c < 0x80) { OUTCHAR(c); @@ -377,7 +379,7 @@ DECODER(euc_jp) unsigned char c2; REQUIRE_INBUF(2) - c2 = IN2; + c2 = INBYTE2; if (c2 >= 0xa1 && c2 <= 0xdf) { OUTCHAR(0xfec0 + c2); NEXT_IN(2); @@ -389,8 +391,8 @@ DECODER(euc_jp) unsigned char c2, c3; REQUIRE_INBUF(3) - c2 = IN2; - c3 = IN3; + c2 = INBYTE2; + c3 = INBYTE3; /* JIS X 0212 */ TRYMAP_DEC(jisx0212, writer, c2 ^ 0x80, c3 ^ 0x80) { NEXT_IN(3); @@ -402,7 +404,7 @@ DECODER(euc_jp) unsigned char c2; REQUIRE_INBUF(2) - c2 = IN2; + c2 = INBYTE2; /* JIS X 0208 */ #ifndef STRICT_BUILD if (c == 0xa1 && c2 == 0xc0) @@ -427,8 +429,8 @@ DECODER(euc_jp) ENCODER(shift_jis) { - while (inleft > 0) { - Py_UCS4 c = IN1; + while (*inpos < inlen) { + Py_UCS4 c = INCHAR1; DBCHAR code; unsigned char c1, c2; @@ -440,14 +442,16 @@ ENCODER(shift_jis) else if (c == 0x203e) code = 0x7e; /* OVERLINE */ #endif else JISX0201_K_ENCODE(c, code) - else UCS4INVALID(c) - else code = NOCHAR; + else if (c > 0xFFFF) + return 1; + else + code = NOCHAR; if (code < 0x80 || (code >= 0xa1 && code <= 0xdf)) { REQUIRE_OUTBUF(1) - OUT1((unsigned char)code) - NEXT(1, 1) + OUTBYTE1((unsigned char)code) + NEXT(1, 1); continue; } @@ -470,9 +474,9 @@ ENCODER(shift_jis) c2 = code & 0xff; c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21); c1 = (c1 - 0x21) >> 1; - OUT1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1) - OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41) - NEXT(1, 2) + OUTBYTE1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1) + OUTBYTE2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41) + NEXT(1, 2); } return 0; @@ -481,7 +485,7 @@ ENCODER(shift_jis) DECODER(shift_jis) { while (inleft > 0) { - unsigned char c = IN1; + unsigned char c = INBYTE1; #ifdef STRICT_BUILD JISX0201_R_DECODE(c, writer) @@ -493,7 +497,7 @@ DECODER(shift_jis) unsigned char c1, c2; REQUIRE_INBUF(2) - c2 = IN2; + c2 = INBYTE2; if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc) return 1; @@ -533,30 +537,29 @@ DECODER(shift_jis) ENCODER(shift_jis_2004) { - while (inleft > 0) { - Py_UCS4 c = IN1; + while (*inpos < inlen) { + Py_UCS4 c = INCHAR1; DBCHAR code = NOCHAR; int c1, c2; Py_ssize_t insize; JISX0201_ENCODE(c, code) - else DECODE_SURROGATE(c) if (code < 0x80 || (code >= 0xa1 && code <= 0xdf)) { - WRITE1((unsigned char)code) - NEXT(1, 1) + WRITEBYTE1((unsigned char)code) + NEXT(1, 1); continue; } REQUIRE_OUTBUF(2) - insize = GET_INSIZE(c); + insize = 1; if (code == NOCHAR) { if (c <= 0xffff) { EMULATE_JISX0213_2000_ENCODE_BMP(code, c) else TRYMAP_ENC(jisx0213_bmp, code, c) { if (code == MULTIC) { - if (inleft < 2) { + if (inlen - *inpos < 2) { if (flags & MBENC_FLUSH) { code = find_pairencmap ((ucs2_t)c, 0, @@ -569,8 +572,9 @@ ENCODER(shift_jis_2004) return MBERR_TOOFEW; } else { + Py_UCS4 ch2 = INCHAR2; code = find_pairencmap( - (ucs2_t)c, IN2, + (ucs2_t)c, ch2, jisx0213_pair_encmap, JISX0213_ENCPAIRS); if (code == DBCINV) { @@ -615,10 +619,10 @@ ENCODER(shift_jis_2004) if (c1 & 1) c2 += 0x5e; c1 >>= 1; - OUT1(c1 + (c1 < 0x1f ? 0x81 : 0xc1)) - OUT2(c2 + (c2 < 0x3f ? 0x40 : 0x41)) + OUTBYTE1(c1 + (c1 < 0x1f ? 0x81 : 0xc1)) + OUTBYTE2(c2 + (c2 < 0x3f ? 0x40 : 0x41)) - NEXT(insize, 2) + NEXT(insize, 2); } return 0; @@ -627,7 +631,7 @@ ENCODER(shift_jis_2004) DECODER(shift_jis_2004) { while (inleft > 0) { - unsigned char c = IN1; + unsigned char c = INBYTE1; JISX0201_DECODE(c, writer) else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xfc)){ @@ -635,7 +639,7 @@ DECODER(shift_jis_2004) Py_UCS4 code; REQUIRE_INBUF(2) - c2 = IN2; + c2 = INBYTE2; if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc) return 1; diff --git a/Modules/cjkcodecs/_codecs_kr.c b/Modules/cjkcodecs/_codecs_kr.c index ca63ee5..0c2309d 100644 --- a/Modules/cjkcodecs/_codecs_kr.c +++ b/Modules/cjkcodecs/_codecs_kr.c @@ -33,16 +33,18 @@ static const unsigned char u2cgk_jongseong[28] = { ENCODER(euc_kr) { - while (inleft > 0) { - Py_UCS4 c = IN1; + while (*inpos < inlen) { + Py_UCS4 c = INCHAR1; DBCHAR code; if (c < 0x80) { - WRITE1((unsigned char)c) - NEXT(1, 1) + WRITEBYTE1((unsigned char)c) + NEXT(1, 1); continue; } - UCS4INVALID(c) + + if (c > 0xFFFF) + return 1; REQUIRE_OUTBUF(2) TRYMAP_ENC(cp949, code, c); @@ -50,9 +52,9 @@ ENCODER(euc_kr) if ((code & 0x8000) == 0) { /* KS X 1001 coded character */ - OUT1((code >> 8) | 0x80) - OUT2((code & 0xFF) | 0x80) - NEXT(1, 2) + OUTBYTE1((code >> 8) | 0x80) + OUTBYTE2((code & 0xFF) | 0x80) + NEXT(1, 2); } else { /* Mapping is found in CP949 extension, * but we encode it in KS X 1001:1998 Annex 3, @@ -61,23 +63,23 @@ ENCODER(euc_kr) REQUIRE_OUTBUF(8) /* syllable composition precedence */ - OUT1(EUCKR_JAMO_FIRSTBYTE) - OUT2(EUCKR_JAMO_FILLER) + OUTBYTE1(EUCKR_JAMO_FIRSTBYTE) + OUTBYTE2(EUCKR_JAMO_FILLER) /* All codepoints in CP949 extension are in unicode * Hangul Syllable area. */ assert(0xac00 <= c && c <= 0xd7a3); c -= 0xac00; - OUT3(EUCKR_JAMO_FIRSTBYTE) - OUT4(u2cgk_choseong[c / 588]) - NEXT_OUT(4) + OUTBYTE3(EUCKR_JAMO_FIRSTBYTE) + OUTBYTE4(u2cgk_choseong[c / 588]) + NEXT_OUT(4); - OUT1(EUCKR_JAMO_FIRSTBYTE) - OUT2(u2cgk_jungseong[(c / 28) % 21]) - OUT3(EUCKR_JAMO_FIRSTBYTE) - OUT4(u2cgk_jongseong[c % 28]) - NEXT(1, 4) + OUTBYTE1(EUCKR_JAMO_FIRSTBYTE) + OUTBYTE2(u2cgk_jungseong[(c / 28) % 21]) + OUTBYTE3(EUCKR_JAMO_FIRSTBYTE) + OUTBYTE4(u2cgk_jongseong[c % 28]) + NEXT(1, 4); } } @@ -102,7 +104,7 @@ static const unsigned char cgk2u_jongseong[] = { /* [A1, BE] */ DECODER(euc_kr) { while (inleft > 0) { - unsigned char c = IN1; + unsigned char c = INBYTE1; if (c < 0x80) { OUTCHAR(c); @@ -113,7 +115,7 @@ DECODER(euc_kr) REQUIRE_INBUF(2) if (c == EUCKR_JAMO_FIRSTBYTE && - IN2 == EUCKR_JAMO_FILLER) { + INBYTE2 == EUCKR_JAMO_FILLER) { /* KS X 1001:1998 Annex 3 make-up sequence */ DBCHAR cho, jung, jong; @@ -146,7 +148,7 @@ DECODER(euc_kr) OUTCHAR(0xac00 + cho*588 + jung*28 + jong); NEXT_IN(8); } - else TRYMAP_DEC(ksx1001, writer, c ^ 0x80, IN2 ^ 0x80) { + else TRYMAP_DEC(ksx1001, writer, c ^ 0x80, INBYTE2 ^ 0x80) { NEXT_IN(2); } else @@ -164,27 +166,29 @@ DECODER(euc_kr) ENCODER(cp949) { - while (inleft > 0) { - Py_UCS4 c = IN1; + while (*inpos < inlen) { + Py_UCS4 c = INCHAR1; DBCHAR code; if (c < 0x80) { - WRITE1((unsigned char)c) - NEXT(1, 1) + WRITEBYTE1((unsigned char)c) + NEXT(1, 1); continue; } - UCS4INVALID(c) + + if (c > 0xFFFF) + return 1; REQUIRE_OUTBUF(2) TRYMAP_ENC(cp949, code, c); else return 1; - OUT1((code >> 8) | 0x80) + OUTBYTE1((code >> 8) | 0x80) if (code & 0x8000) - OUT2(code & 0xFF) /* MSB set: CP949 */ + OUTBYTE2(code & 0xFF) /* MSB set: CP949 */ else - OUT2((code & 0xFF) | 0x80) /* MSB unset: ks x 1001 */ - NEXT(1, 2) + OUTBYTE2((code & 0xFF) | 0x80) /* MSB unset: ks x 1001 */ + NEXT(1, 2); } return 0; @@ -193,7 +197,7 @@ ENCODER(cp949) DECODER(cp949) { while (inleft > 0) { - unsigned char c = IN1; + unsigned char c = INBYTE1; if (c < 0x80) { OUTCHAR(c); @@ -202,8 +206,8 @@ DECODER(cp949) } REQUIRE_INBUF(2) - TRYMAP_DEC(ksx1001, writer, c ^ 0x80, IN2 ^ 0x80); - else TRYMAP_DEC(cp949ext, writer, c, IN2); + TRYMAP_DEC(ksx1001, writer, c ^ 0x80, INBYTE2 ^ 0x80); + else TRYMAP_DEC(cp949ext, writer, c, INBYTE2); else return 1; NEXT_IN(2); @@ -246,16 +250,18 @@ static const DBCHAR u2johabjamo[] = { ENCODER(johab) { - while (inleft > 0) { - Py_UCS4 c = IN1; + while (*inpos < inlen) { + Py_UCS4 c = INCHAR1; DBCHAR code; if (c < 0x80) { - WRITE1((unsigned char)c) - NEXT(1, 1) + WRITEBYTE1((unsigned char)c) + NEXT(1, 1); continue; } - UCS4INVALID(c) + + if (c > 0xFFFF) + return 1; REQUIRE_OUTBUF(2) @@ -281,9 +287,9 @@ ENCODER(johab) t1 = (c1 < 0x4a ? (c1 - 0x21 + 0x1b2) : (c1 - 0x21 + 0x197)); t2 = ((t1 & 1) ? 0x5e : 0) + (c2 - 0x21); - OUT1(t1 >> 1) - OUT2(t2 < 0x4e ? t2 + 0x31 : t2 + 0x43) - NEXT(1, 2) + OUTBYTE1(t1 >> 1) + OUTBYTE2(t2 < 0x4e ? t2 + 0x31 : t2 + 0x43) + NEXT(1, 2); continue; } else @@ -292,9 +298,9 @@ ENCODER(johab) else return 1; - OUT1(code >> 8) - OUT2(code & 0xff) - NEXT(1, 2) + OUTBYTE1(code >> 8) + OUTBYTE2(code & 0xff) + NEXT(1, 2); } return 0; @@ -344,7 +350,7 @@ static const unsigned char johabjamo_jongseong[32] = { DECODER(johab) { while (inleft > 0) { - unsigned char c = IN1, c2; + unsigned char c = INBYTE1, c2; if (c < 0x80) { OUTCHAR(c); @@ -353,7 +359,7 @@ DECODER(johab) } REQUIRE_INBUF(2) - c2 = IN2; + c2 = INBYTE2; if (c < 0xd8) { /* johab hangul */ diff --git a/Modules/cjkcodecs/_codecs_tw.c b/Modules/cjkcodecs/_codecs_tw.c index a91c01b..80e0b81 100644 --- a/Modules/cjkcodecs/_codecs_tw.c +++ b/Modules/cjkcodecs/_codecs_tw.c @@ -13,26 +13,28 @@ ENCODER(big5) { - while (inleft > 0) { - Py_UCS4 c = **inbuf; + while (*inpos < inlen) { + Py_UCS4 c = INCHAR1; DBCHAR code; if (c < 0x80) { REQUIRE_OUTBUF(1) **outbuf = (unsigned char)c; - NEXT(1, 1) + NEXT(1, 1); continue; } - UCS4INVALID(c) + + if (c > 0xFFFF) + return 1; REQUIRE_OUTBUF(2) TRYMAP_ENC(big5, code, c); else return 1; - OUT1(code >> 8) - OUT2(code & 0xFF) - NEXT(1, 2) + OUTBYTE1(code >> 8) + OUTBYTE2(code & 0xFF) + NEXT(1, 2); } return 0; @@ -41,7 +43,7 @@ ENCODER(big5) DECODER(big5) { while (inleft > 0) { - unsigned char c = IN1; + unsigned char c = INBYTE1; if (c < 0x80) { OUTCHAR(c); @@ -50,7 +52,7 @@ DECODER(big5) } REQUIRE_INBUF(2) - TRYMAP_DEC(big5, writer, c, IN2) { + TRYMAP_DEC(big5, writer, c, INBYTE2) { NEXT_IN(2); } else return 1; @@ -66,25 +68,27 @@ DECODER(big5) ENCODER(cp950) { - while (inleft > 0) { - Py_UCS4 c = IN1; + while (*inpos < inlen) { + Py_UCS4 c = INCHAR1; DBCHAR code; if (c < 0x80) { - WRITE1((unsigned char)c) - NEXT(1, 1) + WRITEBYTE1((unsigned char)c) + NEXT(1, 1); continue; } - UCS4INVALID(c) + + if (c > 0xFFFF) + return 1; REQUIRE_OUTBUF(2) TRYMAP_ENC(cp950ext, code, c); else TRYMAP_ENC(big5, code, c); else return 1; - OUT1(code >> 8) - OUT2(code & 0xFF) - NEXT(1, 2) + OUTBYTE1(code >> 8) + OUTBYTE2(code & 0xFF) + NEXT(1, 2); } return 0; @@ -93,7 +97,7 @@ ENCODER(cp950) DECODER(cp950) { while (inleft > 0) { - unsigned char c = IN1; + unsigned char c = INBYTE1; if (c < 0x80) { OUTCHAR(c); @@ -103,8 +107,8 @@ DECODER(cp950) REQUIRE_INBUF(2) - TRYMAP_DEC(cp950ext, writer, c, IN2); - else TRYMAP_DEC(big5, writer, c, IN2); + TRYMAP_DEC(cp950ext, writer, c, INBYTE2); + else TRYMAP_DEC(big5, writer, c, INBYTE2); else return 1; NEXT_IN(2); diff --git a/Modules/cjkcodecs/cjkcodecs.h b/Modules/cjkcodecs/cjkcodecs.h index 65b5c07..18cc02f 100644 --- a/Modules/cjkcodecs/cjkcodecs.h +++ b/Modules/cjkcodecs/cjkcodecs.h @@ -72,7 +72,8 @@ static const struct dbcs_map *mapping_list; #define ENCODER(encoding) \ static Py_ssize_t encoding##_encode( \ MultibyteCodec_State *state, const void *config, \ - const Py_UNICODE **inbuf, Py_ssize_t inleft, \ + int kind, void *data, \ + Py_ssize_t *inpos, Py_ssize_t inlen, \ unsigned char **outbuf, Py_ssize_t outleft, int flags) #define ENCODER_RESET(encoding) \ static Py_ssize_t encoding##_encode_reset( \ @@ -91,25 +92,25 @@ static const struct dbcs_map *mapping_list; static Py_ssize_t encoding##_decode_reset( \ MultibyteCodec_State *state, const void *config) -#if Py_UNICODE_SIZE == 4 -#define UCS4INVALID(code) \ - if ((code) > 0xFFFF) \ - return 1; -#else -#define UCS4INVALID(code) \ - if (0) ; -#endif - #define NEXT_IN(i) \ do { \ (*inbuf) += (i); \ (inleft) -= (i); \ } while (0) +#define NEXT_INCHAR(i) \ + do { \ + (*inpos) += (i); \ + } while (0) #define NEXT_OUT(o) \ - (*outbuf) += (o); \ - (outleft) -= (o); + do { \ + (*outbuf) += (o); \ + (outleft) -= (o); \ + } while (0) #define NEXT(i, o) \ - NEXT_IN(i); NEXT_OUT(o) + do { \ + NEXT_INCHAR(i); \ + NEXT_OUT(o); \ + } while (0) #define REQUIRE_INBUF(n) \ if (inleft < (n)) \ @@ -118,10 +119,13 @@ static const struct dbcs_map *mapping_list; if (outleft < (n)) \ return MBERR_TOOSMALL; -#define IN1 ((*inbuf)[0]) -#define IN2 ((*inbuf)[1]) -#define IN3 ((*inbuf)[2]) -#define IN4 ((*inbuf)[3]) +#define INBYTE1 ((*inbuf)[0]) +#define INBYTE2 ((*inbuf)[1]) +#define INBYTE3 ((*inbuf)[2]) +#define INBYTE4 ((*inbuf)[3]) + +#define INCHAR1 PyUnicode_READ(kind, data, *inpos) +#define INCHAR2 PyUnicode_READ(kind, data, *inpos + 1) #define OUTCHAR(c) \ do { \ @@ -140,24 +144,24 @@ static const struct dbcs_map *mapping_list; writer->pos += 2; \ } while (0) -#define OUT1(c) ((*outbuf)[0]) = (c); -#define OUT2(c) ((*outbuf)[1]) = (c); -#define OUT3(c) ((*outbuf)[2]) = (c); -#define OUT4(c) ((*outbuf)[3]) = (c); +#define OUTBYTE1(c) ((*outbuf)[0]) = (c); +#define OUTBYTE2(c) ((*outbuf)[1]) = (c); +#define OUTBYTE3(c) ((*outbuf)[2]) = (c); +#define OUTBYTE4(c) ((*outbuf)[3]) = (c); -#define WRITE1(c1) \ +#define WRITEBYTE1(c1) \ REQUIRE_OUTBUF(1) \ (*outbuf)[0] = (c1); -#define WRITE2(c1, c2) \ +#define WRITEBYTE2(c1, c2) \ REQUIRE_OUTBUF(2) \ (*outbuf)[0] = (c1); \ (*outbuf)[1] = (c2); -#define WRITE3(c1, c2, c3) \ +#define WRITEBYTE3(c1, c2, c3) \ REQUIRE_OUTBUF(3) \ (*outbuf)[0] = (c1); \ (*outbuf)[1] = (c2); \ (*outbuf)[2] = (c3); -#define WRITE4(c1, c2, c3, c4) \ +#define WRITEBYTE4(c1, c2, c3, c4) \ REQUIRE_OUTBUF(4) \ (*outbuf)[0] = (c1); \ (*outbuf)[1] = (c2); \ @@ -209,20 +213,6 @@ _TRYMAP_DEC_WRITE(_PyUnicodeWriter *writer, Py_UCS4 c) #define TRYMAP_DEC_MPLANE(charset, writer, plane, c1, c2) \ if _TRYMAP_DEC(&charset##_decmap[plane][c1], writer, c2) -#if Py_UNICODE_SIZE == 2 -#define DECODE_SURROGATE(c) \ - if (Py_UNICODE_IS_HIGH_SURROGATE(c)) { \ - REQUIRE_INBUF(2) \ - if (Py_UNICODE_IS_LOW_SURROGATE(IN2)) { \ - c = Py_UNICODE_JOIN_SURROGATES(c, IN2); \ - } \ - } -#define GET_INSIZE(c) ((c) > 0xffff ? 2 : 1) -#else -#define DECODE_SURROGATE(c) {;} -#define GET_INSIZE(c) 1 -#endif - #define BEGIN_MAPPINGS_LIST static const struct dbcs_map _mapping_list[] = { #define MAPPING_ENCONLY(enc) {#enc, (void*)enc##_encmap, NULL}, #define MAPPING_DECONLY(enc) {#enc, NULL, (void*)enc##_decmap}, diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c index 7e16b63..5a916fd 100644 --- a/Modules/cjkcodecs/multibytecodec.c +++ b/Modules/cjkcodecs/multibytecodec.c @@ -10,7 +10,8 @@ #include "multibytecodec.h" typedef struct { - const Py_UNICODE *inbuf, *inbuf_top, *inbuf_end; + PyObject *inobj; + Py_ssize_t inpos, inlen; unsigned char *outbuf, *outbuf_end; PyObject *excobj, *outobj; } MultibyteEncodeBuffer; @@ -45,7 +46,7 @@ static char *incrementalkwarglist[] = {"input", "final", NULL}; static char *streamkwarglist[] = {"stream", "errors", NULL}; static PyObject *multibytecodec_encode(MultibyteCodec *, - MultibyteCodec_State *, const Py_UNICODE **, Py_ssize_t, + MultibyteCodec_State *, PyObject *, Py_ssize_t *, PyObject *, int); #define MBENC_RESET MBENC_MAX<<1 /* reset after an encoding session */ @@ -224,7 +225,7 @@ multibytecodec_encerror(MultibyteCodec *codec, return 0; /* retry it */ case MBERR_TOOFEW: reason = "incomplete multibyte sequence"; - esize = (Py_ssize_t)(buf->inbuf_end - buf->inbuf); + esize = (Py_ssize_t)buf->inpos; break; case MBERR_INTERNAL: PyErr_SetString(PyExc_RuntimeError, @@ -238,14 +239,24 @@ multibytecodec_encerror(MultibyteCodec *codec, } if (errors == ERROR_REPLACE) { - const Py_UNICODE replchar = '?', *inbuf = &replchar; + PyObject *replchar; Py_ssize_t r; + Py_ssize_t inpos; + int kind; + void *data; + + replchar = PyUnicode_FromOrdinal('?'); + if (replchar == NULL) + goto errorexit; + kind = PyUnicode_KIND(replchar); + data = PyUnicode_DATA(replchar); + inpos = 0; for (;;) { - Py_ssize_t outleft; + Py_ssize_t outleft = (Py_ssize_t)(buf->outbuf_end - buf->outbuf); - outleft = (Py_ssize_t)(buf->outbuf_end - buf->outbuf); - r = codec->encode(state, codec->config, &inbuf, 1, + r = codec->encode(state, codec->config, + kind, data, &inpos, 1, &buf->outbuf, outleft, 0); if (r == MBERR_TOOSMALL) { REQUIRE_ENCODEBUFFER(buf, -1); @@ -255,25 +266,27 @@ multibytecodec_encerror(MultibyteCodec *codec, break; } + Py_DECREF(replchar); + if (r != 0) { REQUIRE_ENCODEBUFFER(buf, 1); *buf->outbuf++ = '?'; } } if (errors == ERROR_IGNORE || errors == ERROR_REPLACE) { - buf->inbuf += esize; + buf->inpos += esize; return 0; } - start = (Py_ssize_t)(buf->inbuf - buf->inbuf_top); + start = (Py_ssize_t)buf->inpos; end = start + esize; /* use cached exception object if available */ if (buf->excobj == NULL) { - buf->excobj = PyUnicodeEncodeError_Create(codec->encoding, - buf->inbuf_top, - buf->inbuf_end - buf->inbuf_top, - start, end, reason); + buf->excobj = PyObject_CallFunction(PyExc_UnicodeEncodeError, + "sOnns", + codec->encoding, buf->inobj, + start, end, reason); if (buf->excobj == NULL) goto errorexit; } @@ -302,10 +315,10 @@ multibytecodec_encerror(MultibyteCodec *codec, } if (PyUnicode_Check(tobj)) { - const Py_UNICODE *uraw = PyUnicode_AS_UNICODE(tobj); + Py_ssize_t inpos; - retstr = multibytecodec_encode(codec, state, &uraw, - PyUnicode_GET_SIZE(tobj), ERROR_STRICT, + retstr = multibytecodec_encode(codec, state, tobj, + &inpos, ERROR_STRICT, MBENC_FLUSH); if (retstr == NULL) goto errorexit; @@ -324,15 +337,15 @@ multibytecodec_encerror(MultibyteCodec *codec, newpos = PyLong_AsSsize_t(PyTuple_GET_ITEM(retobj, 1)); if (newpos < 0 && !PyErr_Occurred()) - newpos += (Py_ssize_t)(buf->inbuf_end - buf->inbuf_top); - if (newpos < 0 || buf->inbuf_top + newpos > buf->inbuf_end) { + newpos += (Py_ssize_t)buf->inlen; + if (newpos < 0 || newpos > buf->inlen) { PyErr_Clear(); PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); goto errorexit; } - buf->inbuf = buf->inbuf_top + newpos; + buf->inpos = newpos; Py_DECREF(retobj); Py_DECREF(retstr); @@ -449,19 +462,29 @@ errorexit: static PyObject * multibytecodec_encode(MultibyteCodec *codec, MultibyteCodec_State *state, - const Py_UNICODE **data, Py_ssize_t datalen, + PyObject *text, Py_ssize_t *inpos_t, PyObject *errors, int flags) { MultibyteEncodeBuffer buf; Py_ssize_t finalsize, r = 0; + Py_ssize_t datalen; + int kind; + void *data; + + if (PyUnicode_READY(text) < 0) + return NULL; + datalen = PyUnicode_GET_LENGTH(text); if (datalen == 0 && !(flags & MBENC_RESET)) return PyBytes_FromStringAndSize(NULL, 0); buf.excobj = NULL; buf.outobj = NULL; - buf.inbuf = buf.inbuf_top = *data; - buf.inbuf_end = buf.inbuf_top + datalen; + buf.inobj = text; /* borrowed reference */ + buf.inpos = 0; + buf.inlen = datalen; + kind = PyUnicode_KIND(buf.inobj); + data = PyUnicode_DATA(buf.inobj); if (datalen > (PY_SSIZE_T_MAX - 16) / 2) { PyErr_NoMemory(); @@ -474,14 +497,14 @@ multibytecodec_encode(MultibyteCodec *codec, buf.outbuf = (unsigned char *)PyBytes_AS_STRING(buf.outobj); buf.outbuf_end = buf.outbuf + PyBytes_GET_SIZE(buf.outobj); - while (buf.inbuf < buf.inbuf_end) { - Py_ssize_t inleft, outleft; - + while (buf.inpos < buf.inlen) { /* we don't reuse inleft and outleft here. * error callbacks can relocate the cursor anywhere on buffer*/ - inleft = (Py_ssize_t)(buf.inbuf_end - buf.inbuf); - outleft = (Py_ssize_t)(buf.outbuf_end - buf.outbuf); - r = codec->encode(state, codec->config, &buf.inbuf, inleft, + Py_ssize_t outleft = (Py_ssize_t)(buf.outbuf_end - buf.outbuf); + + r = codec->encode(state, codec->config, + kind, data, + &buf.inpos, buf.inlen, &buf.outbuf, outleft, flags); if ((r == 0) || (r == MBERR_TOOFEW && !(flags & MBENC_FLUSH))) break; @@ -512,7 +535,8 @@ multibytecodec_encode(MultibyteCodec *codec, if (_PyBytes_Resize(&buf.outobj, finalsize) == -1) goto errorexit; - *data = buf.inbuf; + if (inpos_t) + *inpos_t = buf.inpos; Py_XDECREF(buf.excobj); return buf.outobj; @@ -527,7 +551,6 @@ MultibyteCodec_Encode(MultibyteCodecObject *self, PyObject *args, PyObject *kwargs) { MultibyteCodec_State state; - Py_UNICODE *data; PyObject *errorcb, *r, *arg, *ucvt; const char *errors = NULL; Py_ssize_t datalen; @@ -550,11 +573,11 @@ MultibyteCodec_Encode(MultibyteCodecObject *self, } } - data = PyUnicode_AsUnicodeAndSize(arg, &datalen); - if (data == NULL) { + if (PyUnicode_READY(arg) < 0) { Py_XDECREF(ucvt); return NULL; } + datalen = PyUnicode_GET_LENGTH(arg); errorcb = internal_error_callback(errors); if (errorcb == NULL) { @@ -566,7 +589,7 @@ MultibyteCodec_Encode(MultibyteCodecObject *self, self->codec->encinit(&state, self->codec->config) != 0) goto errorexit; r = multibytecodec_encode(self->codec, &state, - (const Py_UNICODE **)&data, datalen, errorcb, + arg, NULL, errorcb, MBENC_FLUSH | MBENC_RESET); if (r == NULL) goto errorexit; @@ -712,8 +735,9 @@ encoder_encode_stateful(MultibyteStatefulEncoderContext *ctx, PyObject *unistr, int final) { PyObject *ucvt, *r = NULL; - Py_UNICODE *inbuf, *inbuf_end, *inbuf_tmp = NULL; - Py_ssize_t datalen, origpending; + PyObject *inbuf = NULL; + Py_ssize_t inpos, datalen; + PyObject *origpending = NULL; wchar_t *data; if (PyUnicode_Check(unistr)) @@ -733,66 +757,64 @@ encoder_encode_stateful(MultibyteStatefulEncoderContext *ctx, data = PyUnicode_AsUnicodeAndSize(unistr, &datalen); if (data == NULL) goto errorexit; - origpending = ctx->pendingsize; - if (origpending > 0) { - if (datalen > PY_SSIZE_T_MAX - ctx->pendingsize) { - PyErr_NoMemory(); - /* inbuf_tmp == NULL */ - goto errorexit; - } - inbuf_tmp = PyMem_New(Py_UNICODE, datalen + ctx->pendingsize); + if (ctx->pending) { + PyObject *inbuf_tmp; + + Py_INCREF(ctx->pending); + origpending = ctx->pending; + + Py_INCREF(ctx->pending); + inbuf_tmp = ctx->pending; + PyUnicode_Append(&inbuf_tmp, unistr); if (inbuf_tmp == NULL) goto errorexit; - memcpy(inbuf_tmp, ctx->pending, - Py_UNICODE_SIZE * ctx->pendingsize); - memcpy(inbuf_tmp + ctx->pendingsize, - PyUnicode_AS_UNICODE(unistr), - Py_UNICODE_SIZE * datalen); - datalen += ctx->pendingsize; - ctx->pendingsize = 0; + Py_CLEAR(ctx->pending); inbuf = inbuf_tmp; } - else - inbuf = (Py_UNICODE *)PyUnicode_AS_UNICODE(unistr); + else { + origpending = NULL; - inbuf_end = inbuf + datalen; + Py_INCREF(unistr); + inbuf = unistr; + } + if (PyUnicode_READY(inbuf) < 0) + goto errorexit; + inpos = 0; + datalen = PyUnicode_GET_LENGTH(inbuf); r = multibytecodec_encode(ctx->codec, &ctx->state, - (const Py_UNICODE **)&inbuf, datalen, - ctx->errors, final ? MBENC_FLUSH | MBENC_RESET : 0); + inbuf, &inpos, + ctx->errors, final ? MBENC_FLUSH | MBENC_RESET : 0); if (r == NULL) { /* recover the original pending buffer */ - if (origpending > 0) - memcpy(ctx->pending, inbuf_tmp, - Py_UNICODE_SIZE * origpending); - ctx->pendingsize = origpending; + Py_CLEAR(ctx->pending); + ctx->pending = origpending; + origpending = NULL; goto errorexit; } - if (inbuf < inbuf_end) { - ctx->pendingsize = (Py_ssize_t)(inbuf_end - inbuf); - if (ctx->pendingsize > MAXENCPENDING) { + if (inpos < datalen) { + if (datalen - inpos > MAXENCPENDING) { /* normal codecs can't reach here */ - ctx->pendingsize = 0; PyErr_SetString(PyExc_UnicodeError, "pending buffer overflow"); goto errorexit; } - memcpy(ctx->pending, inbuf, - ctx->pendingsize * Py_UNICODE_SIZE); + ctx->pending = PyUnicode_Substring(inbuf, inpos, datalen); + if (ctx->pending == NULL) { + /* normal codecs can't reach here */ + goto errorexit; + } } - if (inbuf_tmp != NULL) - PyMem_Del(inbuf_tmp); Py_XDECREF(ucvt); return r; errorexit: - if (inbuf_tmp != NULL) - PyMem_Del(inbuf_tmp); Py_XDECREF(r); Py_XDECREF(ucvt); + Py_XDECREF(origpending); return NULL; } @@ -876,7 +898,7 @@ mbiencoder_reset(MultibyteIncrementalEncoderObject *self) if (r != 0) return NULL; } - self->pendingsize = 0; + Py_CLEAR(self->pending); Py_RETURN_NONE; } @@ -912,7 +934,7 @@ mbiencoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds) } self->codec = ((MultibyteCodecObject *)codec)->codec; - self->pendingsize = 0; + self->pending = NULL; self->errors = internal_error_callback(errors); if (self->errors == NULL) goto errorexit; @@ -1598,18 +1620,16 @@ mbstreamwriter_writelines(MultibyteStreamWriterObject *self, PyObject *lines) static PyObject * mbstreamwriter_reset(MultibyteStreamWriterObject *self) { - const Py_UNICODE *pending; PyObject *pwrt; - pending = self->pending; pwrt = multibytecodec_encode(self->codec, &self->state, - &pending, self->pendingsize, self->errors, + self->pending, NULL, self->errors, MBENC_FLUSH | MBENC_RESET); /* some pending buffer can be truncated when UnicodeEncodeError is * raised on 'strict' mode. but, 'reset' method is designed to * reset the pending buffer or states so failed string sequence * ought to be missed */ - self->pendingsize = 0; + Py_CLEAR(self->pending); if (pwrt == NULL) return NULL; @@ -1655,7 +1675,7 @@ mbstreamwriter_new(PyTypeObject *type, PyObject *args, PyObject *kwds) self->codec = ((MultibyteCodecObject *)codec)->codec; self->stream = stream; Py_INCREF(stream); - self->pendingsize = 0; + self->pending = NULL; self->errors = internal_error_callback(errors); if (self->errors == NULL) goto errorexit; diff --git a/Modules/cjkcodecs/multibytecodec.h b/Modules/cjkcodecs/multibytecodec.h index 8e71266..3050aeb 100644 --- a/Modules/cjkcodecs/multibytecodec.h +++ b/Modules/cjkcodecs/multibytecodec.h @@ -27,7 +27,8 @@ typedef union { typedef int (*mbcodec_init)(const void *config); typedef Py_ssize_t (*mbencode_func)(MultibyteCodec_State *state, const void *config, - const Py_UNICODE **inbuf, Py_ssize_t inleft, + int kind, void *data, + Py_ssize_t *inpos, Py_ssize_t inlen, unsigned char **outbuf, Py_ssize_t outleft, int flags); typedef int (*mbencodeinit_func)(MultibyteCodec_State *state, @@ -75,8 +76,7 @@ typedef struct { #define MAXENCPENDING 2 #define _MultibyteStatefulEncoder_HEAD \ _MultibyteStatefulCodec_HEAD \ - Py_UNICODE pending[MAXENCPENDING]; \ - Py_ssize_t pendingsize; + PyObject *pending; typedef struct { _MultibyteStatefulEncoder_HEAD } MultibyteStatefulEncoderContext; |