diff options
Diffstat (limited to 'Modules/cjkcodecs/_codecs_cn.c')
-rw-r--r-- | Modules/cjkcodecs/_codecs_cn.c | 293 |
1 files changed, 156 insertions, 137 deletions
diff --git a/Modules/cjkcodecs/_codecs_cn.c b/Modules/cjkcodecs/_codecs_cn.c index 9e9e96c..013c3fb 100644 --- a/Modules/cjkcodecs/_codecs_cn.c +++ b/Modules/cjkcodecs/_codecs_cn.c @@ -23,18 +23,33 @@ * A844 undefined U+2015 HORIZONTAL BAR */ -#define GBK_DECODE(dc1, dc2, assi) \ - if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \ - else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \ - else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \ - else TRYMAP_DEC(gb2312, assi, dc1 ^ 0x80, dc2 ^ 0x80); \ - else TRYMAP_DEC(gbkext, assi, dc1, dc2); - -#define GBK_ENCODE(code, assi) \ - if ((code) == 0x2014) (assi) = 0xa1aa; \ - else if ((code) == 0x2015) (assi) = 0xa844; \ - else if ((code) == 0x00b7) (assi) = 0xa1a4; \ - else if ((code) != 0x30fb && TRYMAP_ENC_COND(gbcommon, assi, code)); +#define GBK_DECODE(dc1, dc2, writer) \ + if ((dc1) == 0xa1 && (dc2) == 0xaa) { \ + OUTCHAR(0x2014); \ + } \ + else if ((dc1) == 0xa8 && (dc2) == 0x44) { \ + OUTCHAR(0x2015); \ + } \ + else if ((dc1) == 0xa1 && (dc2) == 0xa4) { \ + OUTCHAR(0x00b7); \ + } \ + else if (TRYMAP_DEC(gb2312, decoded, dc1 ^ 0x80, dc2 ^ 0x80)) { \ + OUTCHAR(decoded); \ + } \ + else if (TRYMAP_DEC(gbkext, decoded, dc1, dc2)) { \ + OUTCHAR(decoded); \ + } + +#define GBK_ENCODE(code, assi) \ + if ((code) == 0x2014) { \ + (assi) = 0xa1aa; \ + } else if ((code) == 0x2015) { \ + (assi) = 0xa844; \ + } else if ((code) == 0x00b7) { \ + (assi) = 0xa1a4; \ + } else if ((code) != 0x30fb && TRYMAP_ENC(gbcommon, assi, code)) { \ + ; \ + } /* * GB2312 codec @@ -42,27 +57,31 @@ ENCODER(gb2312) { - while (inleft > 0) { - Py_UNICODE c = IN1; + while (*inpos < inlen) { + Py_UCS4 c = INCHAR1; DBCHAR code; if (c < 0x80) { - WRITE1((unsigned char)c) - NEXT(1, 1) + WRITEBYTE1((unsigned char)c); + NEXT(1, 1); continue; } - UCS4INVALID(c) - REQUIRE_OUTBUF(2) - TRYMAP_ENC(gbcommon, code, c); - else return 1; + if (c > 0xFFFF) + return 1; + + REQUIRE_OUTBUF(2); + if (TRYMAP_ENC(gbcommon, code, c)) + ; + else + return 1; if (code & 0x8000) /* MSB set: GBK */ return 1; - OUT1((code >> 8) | 0x80) - OUT2((code & 0xFF) | 0x80) - NEXT(1, 2) + OUTBYTE1((code >> 8) | 0x80); + OUTBYTE2((code & 0xFF) | 0x80); + NEXT(1, 2); } return 0; @@ -72,20 +91,21 @@ DECODER(gb2312) { while (inleft > 0) { unsigned char c = **inbuf; - - REQUIRE_OUTBUF(1) + Py_UCS4 decoded; if (c < 0x80) { - OUT1(c) - NEXT(1, 1) + OUTCHAR(c); + NEXT_IN(1); continue; } - REQUIRE_INBUF(2) - TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) { - NEXT(2, 1) + REQUIRE_INBUF(2); + if (TRYMAP_DEC(gb2312, decoded, c ^ 0x80, INBYTE2 ^ 0x80)) { + OUTCHAR(decoded); + NEXT_IN(2); } - else return 1; + else + return 1; } return 0; @@ -98,28 +118,31 @@ DECODER(gb2312) ENCODER(gbk) { - while (inleft > 0) { - Py_UNICODE c = IN1; + while (*inpos < inlen) { + Py_UCS4 c = INCHAR1; DBCHAR code; if (c < 0x80) { - WRITE1((unsigned char)c) - NEXT(1, 1) + WRITEBYTE1((unsigned char)c); + NEXT(1, 1); continue; } - UCS4INVALID(c) - REQUIRE_OUTBUF(2) + if (c > 0xFFFF) + return 1; + + REQUIRE_OUTBUF(2); GBK_ENCODE(c, code) - else return 1; + else + return 1; - OUT1((code >> 8) | 0x80) + OUTBYTE1((code >> 8) | 0x80); if (code & 0x8000) - OUT2((code & 0xFF)) /* MSB set: GBK */ + OUTBYTE2((code & 0xFF)); /* MSB set: GBK */ else - OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */ - NEXT(1, 2) + OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: GB2312 */ + NEXT(1, 2); } return 0; @@ -128,22 +151,22 @@ ENCODER(gbk) DECODER(gbk) { while (inleft > 0) { - unsigned char c = IN1; - - REQUIRE_OUTBUF(1) + unsigned char c = INBYTE1; + Py_UCS4 decoded; if (c < 0x80) { - OUT1(c) - NEXT(1, 1) + OUTCHAR(c); + NEXT_IN(1); continue; } - REQUIRE_INBUF(2) + REQUIRE_INBUF(2); - GBK_DECODE(c, IN2, **outbuf) - else return 1; + GBK_DECODE(c, INBYTE2, writer) + else + return 1; - NEXT(2, 1) + NEXT_IN(2); } return 0; @@ -156,72 +179,63 @@ DECODER(gbk) ENCODER(gb18030) { - while (inleft > 0) { - ucs4_t c = IN1; + while (*inpos < inlen) { + Py_UCS4 c = INCHAR1; DBCHAR code; if (c < 0x80) { - WRITE1(c) - NEXT(1, 1) + WRITEBYTE1(c); + NEXT(1, 1); continue; } - DECODE_SURROGATE(c) - if (c > 0x10FFFF) -#if Py_UNICODE_SIZE == 2 - return 2; /* surrogates pair */ -#else - return 1; -#endif - else if (c >= 0x10000) { - ucs4_t tc = c - 0x10000; + if (c >= 0x10000) { + Py_UCS4 tc = c - 0x10000; + assert (c <= 0x10FFFF); - REQUIRE_OUTBUF(4) + REQUIRE_OUTBUF(4); - OUT4((unsigned char)(tc % 10) + 0x30) + OUTBYTE4((unsigned char)(tc % 10) + 0x30); tc /= 10; - OUT3((unsigned char)(tc % 126) + 0x81) + OUTBYTE3((unsigned char)(tc % 126) + 0x81); tc /= 126; - OUT2((unsigned char)(tc % 10) + 0x30) + OUTBYTE2((unsigned char)(tc % 10) + 0x30); tc /= 10; - OUT1((unsigned char)(tc + 0x90)) + OUTBYTE1((unsigned char)(tc + 0x90)); -#if Py_UNICODE_SIZE == 2 - NEXT(2, 4) /* surrogates pair */ -#else - NEXT(1, 4) -#endif + NEXT(1, 4); continue; } - REQUIRE_OUTBUF(2) + REQUIRE_OUTBUF(2); GBK_ENCODE(c, code) - else TRYMAP_ENC(gb18030ext, code, c); + else if (TRYMAP_ENC(gb18030ext, code, c)) + ; else { const struct _gb18030_to_unibmp_ranges *utrrange; - REQUIRE_OUTBUF(4) + REQUIRE_OUTBUF(4); for (utrrange = gb18030_to_unibmp_ranges; utrrange->first != 0; utrrange++) if (utrrange->first <= c && c <= utrrange->last) { - Py_UNICODE tc; + Py_UCS4 tc; tc = c - utrrange->first + utrrange->base; - OUT4((unsigned char)(tc % 10) + 0x30) + OUTBYTE4((unsigned char)(tc % 10) + 0x30); tc /= 10; - OUT3((unsigned char)(tc % 126) + 0x81) + OUTBYTE3((unsigned char)(tc % 126) + 0x81); tc /= 126; - OUT2((unsigned char)(tc % 10) + 0x30) + OUTBYTE2((unsigned char)(tc % 10) + 0x30); tc /= 10; - OUT1((unsigned char)tc + 0x81) + OUTBYTE1((unsigned char)tc + 0x81); - NEXT(1, 4) + NEXT(1, 4); break; } @@ -230,13 +244,13 @@ ENCODER(gb18030) continue; } - OUT1((code >> 8) | 0x80) + OUTBYTE1((code >> 8) | 0x80); if (code & 0x8000) - OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */ + OUTBYTE2((code & 0xFF)); /* MSB set: GBK or GB18030ext */ else - OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */ + OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: GB2312 */ - NEXT(1, 2) + NEXT(1, 2); } return 0; @@ -245,61 +259,62 @@ ENCODER(gb18030) DECODER(gb18030) { while (inleft > 0) { - unsigned char c = IN1, c2; - - REQUIRE_OUTBUF(1) + unsigned char c = INBYTE1, c2; + Py_UCS4 decoded; if (c < 0x80) { - OUT1(c) - NEXT(1, 1) + OUTCHAR(c); + NEXT_IN(1); continue; } - REQUIRE_INBUF(2) + REQUIRE_INBUF(2); - c2 = IN2; + c2 = INBYTE2; if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */ const struct _gb18030_to_unibmp_ranges *utr; unsigned char c3, c4; - ucs4_t lseq; + Py_UCS4 lseq; - REQUIRE_INBUF(4) - c3 = IN3; - c4 = IN4; + REQUIRE_INBUF(4); + c3 = INBYTE3; + c4 = INBYTE4; if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39) return 1; c -= 0x81; c2 -= 0x30; c3 -= 0x81; c4 -= 0x30; if (c < 4) { /* U+0080 - U+FFFF */ - lseq = ((ucs4_t)c * 10 + c2) * 1260 + - (ucs4_t)c3 * 10 + c4; + lseq = ((Py_UCS4)c * 10 + c2) * 1260 + + (Py_UCS4)c3 * 10 + c4; if (lseq < 39420) { for (utr = gb18030_to_unibmp_ranges; lseq >= (utr + 1)->base; utr++) ; - OUT1(utr->first - utr->base + lseq) - NEXT(4, 1) + OUTCHAR(utr->first - utr->base + lseq); + NEXT_IN(4); continue; } } else if (c >= 15) { /* U+10000 - U+10FFFF */ - lseq = 0x10000 + (((ucs4_t)c-15) * 10 + c2) - * 1260 + (ucs4_t)c3 * 10 + c4; + lseq = 0x10000 + (((Py_UCS4)c-15) * 10 + c2) + * 1260 + (Py_UCS4)c3 * 10 + c4; if (lseq <= 0x10FFFF) { - WRITEUCS4(lseq); - NEXT_IN(4) + OUTCHAR(lseq); + NEXT_IN(4); continue; } } return 1; } - GBK_DECODE(c, c2, **outbuf) - else TRYMAP_DEC(gb18030ext, **outbuf, c, c2); - else return 1; + GBK_DECODE(c, c2, writer) + else if (TRYMAP_DEC(gb18030ext, decoded, c, c2)) + OUTCHAR(decoded); + else + return 1; - NEXT(2, 1) + NEXT_IN(2); } return 0; @@ -319,48 +334,51 @@ ENCODER_INIT(hz) ENCODER_RESET(hz) { if (state->i != 0) { - WRITE2('~', '}') + WRITEBYTE2('~', '}'); state->i = 0; - NEXT_OUT(2) + NEXT_OUT(2); } return 0; } ENCODER(hz) { - while (inleft > 0) { - Py_UNICODE c = IN1; + while (*inpos < inlen) { + Py_UCS4 c = INCHAR1; DBCHAR code; if (c < 0x80) { if (state->i == 0) { - WRITE1((unsigned char)c) - NEXT(1, 1) + WRITEBYTE1((unsigned char)c); + NEXT(1, 1); } else { - WRITE3('~', '}', (unsigned char)c) - NEXT(1, 3) + WRITEBYTE3('~', '}', (unsigned char)c); + NEXT(1, 3); state->i = 0; } continue; } - UCS4INVALID(c) + if (c > 0xFFFF) + return 1; - TRYMAP_ENC(gbcommon, code, c); - else return 1; + if (TRYMAP_ENC(gbcommon, code, c)) + ; + else + return 1; if (code & 0x8000) /* MSB set: GBK */ return 1; if (state->i == 0) { - WRITE4('~', '{', code >> 8, code & 0xff) - NEXT(1, 4) + WRITEBYTE4('~', '{', code >> 8, code & 0xff); + NEXT(1, 4); state->i = 1; } else { - WRITE2(code >> 8, code & 0xff) - NEXT(1, 2) + WRITEBYTE2(code >> 8, code & 0xff); + NEXT(1, 2); } } @@ -382,15 +400,16 @@ DECODER_RESET(hz) DECODER(hz) { while (inleft > 0) { - unsigned char c = IN1; + unsigned char c = INBYTE1; + Py_UCS4 decoded; if (c == '~') { - unsigned char c2 = IN2; + unsigned char c2 = INBYTE2; - REQUIRE_INBUF(2) + REQUIRE_INBUF(2); if (c2 == '~') { - WRITE1('~') - NEXT(2, 1) + OUTCHAR('~'); + NEXT_IN(2); continue; } else if (c2 == '{' && state->i == 0) @@ -401,7 +420,7 @@ DECODER(hz) ; /* line-continuation */ else return 1; - NEXT(2, 0); + NEXT_IN(2); continue; } @@ -409,14 +428,14 @@ DECODER(hz) return 1; if (state->i == 0) { /* ASCII mode */ - WRITE1(c) - NEXT(1, 1) + OUTCHAR(c); + NEXT_IN(1); } else { /* GB mode */ - REQUIRE_INBUF(2) - REQUIRE_OUTBUF(1) - TRYMAP_DEC(gb2312, **outbuf, c, IN2) { - NEXT(2, 1) + REQUIRE_INBUF(2); + if (TRYMAP_DEC(gb2312, decoded, c, INBYTE2)) { + OUTCHAR(decoded); + NEXT_IN(2); } else return 1; |