diff options
Diffstat (limited to 'Modules/cjkcodecs/_codecs_jp.c')
-rw-r--r-- | Modules/cjkcodecs/_codecs_jp.c | 731 |
1 files changed, 731 insertions, 0 deletions
diff --git a/Modules/cjkcodecs/_codecs_jp.c b/Modules/cjkcodecs/_codecs_jp.c new file mode 100644 index 0000000..5ff9494 --- /dev/null +++ b/Modules/cjkcodecs/_codecs_jp.c @@ -0,0 +1,731 @@ +/* + * _codecs_jp.c: Codecs collection for Japanese encodings + * + * Written by Hye-Shik Chang <perky@FreeBSD.org> + * $CJKCodecs: _codecs_jp.c,v 1.14 2004/07/07 17:54:47 perky Exp $ + */ + +#define USING_BINARY_PAIR_SEARCH +#define EMPBASE 0x20000 + +#include "cjkcodecs.h" +#include "mappings_jp.h" +#include "mappings_jisx0213_pair.h" +#include "alg_jisx0201.h" +#include "emu_jisx0213_2000.h" + +/* + * CP932 codec + */ + +ENCODER(cp932) +{ + while (inleft > 0) { + Py_UNICODE c = IN1; + DBCHAR code; + unsigned char c1, c2; + + if (c <= 0x80) { + WRITE1((unsigned char)c) + NEXT(1, 1) + continue; + } + else if (c >= 0xff61 && c <= 0xff9f) { + WRITE1(c - 0xfec0) + NEXT(1, 1) + continue; + } + else if (c >= 0xf8f0 && c <= 0xf8f3) { + /* Windows compatability */ + REQUIRE_OUTBUF(1) + if (c == 0xf8f0) + OUT1(0xa0) + else + OUT1(c - 0xfef1 + 0xfd) + NEXT(1, 1) + continue; + } + + UCS4INVALID(c) + REQUIRE_OUTBUF(2) + + TRYMAP_ENC(cp932ext, code, c) { + OUT1(code >> 8) + OUT2(code & 0xff) + } + else TRYMAP_ENC(jisxcommon, code, c) { + if (code & 0x8000) /* MSB set: JIS X 0212 */ + return 1; + + /* JIS X 0208 */ + c1 = code >> 8; + c2 = code & 0xff; + c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21); + c1 = (c1 - 0x21) >> 1; + OUT1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1) + OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41) + } + else if (c >= 0xe000 && c < 0xe758) { + /* User-defined area */ + c1 = (Py_UNICODE)(c - 0xe000) / 188; + c2 = (Py_UNICODE)(c - 0xe000) % 188; + OUT1(c1 + 0xf0) + OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41) + } + else + return 1; + + NEXT(1, 2) + } + + return 0; +} + +DECODER(cp932) +{ + while (inleft > 0) { + unsigned char c = IN1, c2; + + REQUIRE_OUTBUF(1) + if (c <= 0x80) { + OUT1(c) + NEXT(1, 1) + continue; + } + else if (c >= 0xa0 && c <= 0xdf) { + if (c == 0xa0) + OUT1(0xf8f0) /* half-width katakana */ + else + OUT1(0xfec0 + c) + NEXT(1, 1) + continue; + } + else if (c >= 0xfd/* && c <= 0xff*/) { + /* Windows compatibility */ + OUT1(0xf8f1 - 0xfd + c) + NEXT(1, 1) + continue; + } + + REQUIRE_INBUF(2) + c2 = IN2; + + TRYMAP_DEC(cp932ext, **outbuf, c, c2); + else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){ + if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc) + return 2; + + c = (c < 0xe0 ? c - 0x81 : c - 0xc1); + c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41); + c = (2 * c + (c2 < 0x5e ? 0 : 1) + 0x21); + c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21; + + TRYMAP_DEC(jisx0208, **outbuf, c, c2); + else return 2; + } + else if (c >= 0xf0 && c <= 0xf9) { + if ((c2 >= 0x40 && c2 <= 0x7e) || + (c2 >= 0x80 && c2 <= 0xfc)) + OUT1(0xe000 + 188 * (c - 0xf0) + + (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41)) + else + return 2; + } + else + return 2; + + NEXT(2, 1) + } + + return 0; +} + + +/* + * EUC-JIS-2004 codec + */ + +ENCODER(euc_jis_2004) +{ + while (inleft > 0) { + ucs4_t c = IN1; + DBCHAR code; + int insize; + + if (c < 0x80) { + WRITE1(c) + NEXT(1, 1) + continue; + } + + DECODE_SURROGATE(c) + insize = GET_INSIZE(c); + + if (c <= 0xFFFF) { + EMULATE_JISX0213_2000_ENCODE_BMP(code, c) + else TRYMAP_ENC(jisx0213_bmp, code, c) { + if (code == MULTIC) { + if (inleft < 2) { + if (flags & MBENC_FLUSH) { + code = find_pairencmap( + (ucs2_t)c, 0, + jisx0213_pair_encmap, + JISX0213_ENCPAIRS); + if (code == DBCINV) + return 1; + } + else + return MBERR_TOOFEW; + } + else { + code = find_pairencmap( + (ucs2_t)c, (*inbuf)[1], + jisx0213_pair_encmap, + JISX0213_ENCPAIRS); + if (code == DBCINV) { + code = find_pairencmap( + (ucs2_t)c, 0, + jisx0213_pair_encmap, + JISX0213_ENCPAIRS); + if (code == DBCINV) + return 1; + } else + insize = 2; + } + } + } + else TRYMAP_ENC(jisxcommon, code, c); + else if (c >= 0xff61 && c <= 0xff9f) { + /* JIS X 0201 half-width katakana */ + WRITE2(0x8e, c - 0xfec0) + NEXT(1, 2) + continue; + } + else if (c == 0xff3c) + /* F/W REVERSE SOLIDUS (see NOTES) */ + code = 0x2140; + else if (c == 0xff5e) + /* F/W TILDE (see NOTES) */ + code = 0x2232; + else + return 1; + } + else if (c >> 16 == EMPBASE >> 16) { + EMULATE_JISX0213_2000_ENCODE_EMP(code, c) + else TRYMAP_ENC(jisx0213_emp, code, c & 0xffff); + else return insize; + } + else + return insize; + + if (code & 0x8000) { + /* Codeset 2 */ + WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80) + NEXT(insize, 3) + } else { + /* Codeset 1 */ + WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80) + NEXT(insize, 2) + } + } + + return 0; +} + +DECODER(euc_jis_2004) +{ + while (inleft > 0) { + unsigned char c = IN1; + ucs4_t code; + + REQUIRE_OUTBUF(1) + + if (c < 0x80) { + OUT1(c) + NEXT(1, 1) + continue; + } + + if (c == 0x8e) { + /* JIS X 0201 half-width katakana */ + unsigned char c2; + + REQUIRE_INBUF(2) + c2 = IN2; + if (c2 >= 0xa1 && c2 <= 0xdf) { + OUT1(0xfec0 + c2) + NEXT(2, 1) + } + else + return 2; + } + else if (c == 0x8f) { + unsigned char c2, c3; + + REQUIRE_INBUF(3) + c2 = IN2 ^ 0x80; + c3 = IN3 ^ 0x80; + + /* JIS X 0213 Plane 2 or JIS X 0212 (see NOTES) */ + EMULATE_JISX0213_2000_DECODE_PLANE2(**outbuf, c2, c3) + else TRYMAP_DEC(jisx0213_2_bmp, **outbuf, c2, c3) ; + else TRYMAP_DEC(jisx0213_2_emp, code, c2, c3) { + WRITEUCS4(EMPBASE | code) + NEXT_IN(3) + continue; + } + else TRYMAP_DEC(jisx0212, **outbuf, c2, c3) ; + else return 3; + NEXT(3, 1) + } + else { + unsigned char c2; + + REQUIRE_INBUF(2) + c ^= 0x80; + c2 = IN2 ^ 0x80; + + /* JIS X 0213 Plane 1 */ + EMULATE_JISX0213_2000_DECODE_PLANE1(**outbuf, c, c2) + else if (c == 0x21 && c2 == 0x40) **outbuf = 0xff3c; + else if (c == 0x22 && c2 == 0x32) **outbuf = 0xff5e; + else TRYMAP_DEC(jisx0208, **outbuf, c, c2); + else TRYMAP_DEC(jisx0213_1_bmp, **outbuf, c, c2); + else TRYMAP_DEC(jisx0213_1_emp, code, c, c2) { + WRITEUCS4(EMPBASE | code) + NEXT_IN(2) + continue; + } + else TRYMAP_DEC(jisx0213_pair, code, c, c2) { + WRITE2(code >> 16, code & 0xffff) + NEXT(2, 2) + continue; + } + else return 2; + NEXT(2, 1) + } + } + + return 0; +} + + +/* + * EUC-JP codec + */ + +ENCODER(euc_jp) +{ + while (inleft > 0) { + Py_UNICODE c = IN1; + DBCHAR code; + + if (c < 0x80) { + WRITE1((unsigned char)c) + NEXT(1, 1) + continue; + } + + UCS4INVALID(c) + + TRYMAP_ENC(jisxcommon, code, c); + else if (c >= 0xff61 && c <= 0xff9f) { + /* JIS X 0201 half-width katakana */ + WRITE2(0x8e, c - 0xfec0) + NEXT(1, 2) + continue; + } +#ifndef STRICT_BUILD + else if (c == 0xff3c) /* FULL-WIDTH REVERSE SOLIDUS */ + code = 0x2140; + else if (c == 0xa5) { /* YEN SIGN */ + WRITE1(0x5c); + NEXT(1, 1) + continue; + } else if (c == 0x203e) { /* OVERLINE */ + WRITE1(0x7e); + NEXT(1, 1) + continue; + } +#endif + else + return 1; + + if (code & 0x8000) { + /* JIS X 0212 */ + WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80) + NEXT(1, 3) + } else { + /* JIS X 0208 */ + WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80) + NEXT(1, 2) + } + } + + return 0; +} + +DECODER(euc_jp) +{ + while (inleft > 0) { + unsigned char c = IN1; + + REQUIRE_OUTBUF(1) + + if (c < 0x80) { + OUT1(c) + NEXT(1, 1) + continue; + } + + if (c == 0x8e) { + /* JIS X 0201 half-width katakana */ + unsigned char c2; + + REQUIRE_INBUF(2) + c2 = IN2; + if (c2 >= 0xa1 && c2 <= 0xdf) { + OUT1(0xfec0 + c2) + NEXT(2, 1) + } + else + return 2; + } + else if (c == 0x8f) { + unsigned char c2, c3; + + REQUIRE_INBUF(3) + c2 = IN2; + c3 = IN3; + /* JIS X 0212 */ + TRYMAP_DEC(jisx0212, **outbuf, c2 ^ 0x80, c3 ^ 0x80) { + NEXT(3, 1) + } + else + return 3; + } + else { + unsigned char c2; + + REQUIRE_INBUF(2) + c2 = IN2; + /* JIS X 0208 */ +#ifndef STRICT_BUILD + if (c == 0xa1 && c2 == 0xc0) + /* FULL-WIDTH REVERSE SOLIDUS */ + **outbuf = 0xff3c; + else +#endif + TRYMAP_DEC(jisx0208, **outbuf, + c ^ 0x80, c2 ^ 0x80) ; + else return 2; + NEXT(2, 1) + } + } + + return 0; +} + + +/* + * SHIFT_JIS codec + */ + +ENCODER(shift_jis) +{ + while (inleft > 0) { + Py_UNICODE c = IN1; + DBCHAR code; + unsigned char c1, c2; + +#ifdef STRICT_BUILD + JISX0201_R_ENCODE(c, code) +#else + if (c < 0x80) code = c; + else if (c == 0x00a5) code = 0x5c; /* YEN SIGN */ + else if (c == 0x203e) code = 0x7e; /* OVERLINE */ +#endif + else JISX0201_K_ENCODE(c, code) + else UCS4INVALID(c) + else code = NOCHAR; + + if (code < 0x80 || (code >= 0xa1 && code <= 0xdf)) { + REQUIRE_OUTBUF(1) + + OUT1((unsigned char)code) + NEXT(1, 1) + continue; + } + + REQUIRE_OUTBUF(2) + + if (code == NOCHAR) { + TRYMAP_ENC(jisxcommon, code, c); +#ifndef STRICT_BUILD + else if (c == 0xff3c) + code = 0x2140; /* FULL-WIDTH REVERSE SOLIDUS */ +#endif + else + return 1; + + if (code & 0x8000) /* MSB set: JIS X 0212 */ + return 1; + } + + c1 = code >> 8; + c2 = code & 0xff; + c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21); + c1 = (c1 - 0x21) >> 1; + OUT1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1) + OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41) + NEXT(1, 2) + } + + return 0; +} + +DECODER(shift_jis) +{ + while (inleft > 0) { + unsigned char c = IN1; + + REQUIRE_OUTBUF(1) + +#ifdef STRICT_BUILD + JISX0201_R_DECODE(c, **outbuf) +#else + if (c < 0x80) **outbuf = c; +#endif + else JISX0201_K_DECODE(c, **outbuf) + else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){ + unsigned char c1, c2; + + REQUIRE_INBUF(2) + c2 = IN2; + if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc) + return 2; + + c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1); + c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41); + c1 = (2 * c1 + (c2 < 0x5e ? 0 : 1) + 0x21); + c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21; + +#ifndef STRICT_BUILD + if (c1 == 0x21 && c2 == 0x40) { + /* FULL-WIDTH REVERSE SOLIDUS */ + OUT1(0xff3c) + NEXT(2, 1) + continue; + } +#endif + TRYMAP_DEC(jisx0208, **outbuf, c1, c2) { + NEXT(2, 1) + continue; + } + else + return 2; + } + else + return 2; + + NEXT(1, 1) /* JIS X 0201 */ + } + + return 0; +} + + +/* + * SHIFT_JIS-2004 codec + */ + +ENCODER(shift_jis_2004) +{ + while (inleft > 0) { + ucs4_t c = IN1; + DBCHAR code = NOCHAR; + int c1, c2; + size_t insize; + + JISX0201_ENCODE(c, code) + else DECODE_SURROGATE(c) + + if (code < 0x80 || (code >= 0xa1 && code <= 0xdf)) { + WRITE1((unsigned char)code) + NEXT(1, 1) + continue; + } + + REQUIRE_OUTBUF(2) + insize = GET_INSIZE(c); + + if (code == NOCHAR) { + if (c <= 0xffff) { + EMULATE_JISX0213_2000_ENCODE_BMP(code, c) + else TRYMAP_ENC(jisx0213_bmp, code, c) { + if (code == MULTIC) { + if (inleft < 2) { + if (flags & MBENC_FLUSH) { + code = find_pairencmap + ((ucs2_t)c, 0, + jisx0213_pair_encmap, + JISX0213_ENCPAIRS); + if (code == DBCINV) + return 1; + } + else + return MBERR_TOOFEW; + } + else { + code = find_pairencmap( + (ucs2_t)c, IN2, + jisx0213_pair_encmap, + JISX0213_ENCPAIRS); + if (code == DBCINV) { + code = find_pairencmap( + (ucs2_t)c, 0, + jisx0213_pair_encmap, + JISX0213_ENCPAIRS); + if (code == DBCINV) + return 1; + } + else + insize = 2; + } + } + } + else TRYMAP_ENC(jisxcommon, code, c) { + /* abandon JIS X 0212 codes */ + if (code & 0x8000) + return 1; + } + else return 1; + } + else if (c >> 16 == EMPBASE >> 16) { + EMULATE_JISX0213_2000_ENCODE_EMP(code, c) + else TRYMAP_ENC(jisx0213_emp, code, c&0xffff); + else return insize; + } + else + return insize; + } + + c1 = code >> 8; + c2 = (code & 0xff) - 0x21; + + if (c1 & 0x80) { /* Plane 2 */ + if (c1 >= 0xee) c1 -= 0x87; + else if (c1 >= 0xac || c1 == 0xa8) c1 -= 0x49; + else c1 -= 0x43; + } + else /* Plane 1 */ + c1 -= 0x21; + + if (c1 & 1) c2 += 0x5e; + c1 >>= 1; + OUT1(c1 + (c1 < 0x1f ? 0x81 : 0xc1)) + OUT2(c2 + (c2 < 0x3f ? 0x40 : 0x41)) + + NEXT(insize, 2) + } + + return 0; +} + +DECODER(shift_jis_2004) +{ + while (inleft > 0) { + unsigned char c = IN1; + + REQUIRE_OUTBUF(1) + JISX0201_DECODE(c, **outbuf) + else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xfc)){ + unsigned char c1, c2 = IN2; + ucs4_t code; + + REQUIRE_INBUF(2) + if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc) + return 2; + + c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1); + c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41); + c1 = (2 * c1 + (c2 < 0x5e ? 0 : 1)); + c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21; + + if (c1 < 0x5e) { /* Plane 1 */ + c1 += 0x21; + EMULATE_JISX0213_2000_DECODE_PLANE1(**outbuf, + c1, c2) + else TRYMAP_DEC(jisx0208, **outbuf, c1, c2) { + NEXT_OUT(1) + } + else TRYMAP_DEC(jisx0213_1_bmp, **outbuf, + c1, c2) { + NEXT_OUT(1) + } + else TRYMAP_DEC(jisx0213_1_emp, code, c1, c2) { + WRITEUCS4(EMPBASE | code) + } + else TRYMAP_DEC(jisx0213_pair, code, c1, c2) { + WRITE2(code >> 16, code & 0xffff) + NEXT_OUT(2) + } + else + return 2; + NEXT_IN(2) + } + else { /* Plane 2 */ + if (c1 >= 0x67) c1 += 0x07; + else if (c1 >= 0x63 || c1 == 0x5f) c1 -= 0x37; + else c1 -= 0x3d; + + EMULATE_JISX0213_2000_DECODE_PLANE2(**outbuf, + c1, c2) + else TRYMAP_DEC(jisx0213_2_bmp, **outbuf, + c1, c2) ; + else TRYMAP_DEC(jisx0213_2_emp, code, c1, c2) { + WRITEUCS4(EMPBASE | code) + NEXT_IN(2) + continue; + } + else + return 2; + NEXT(2, 1) + } + continue; + } + else + return 2; + + NEXT(1, 1) /* JIS X 0201 */ + } + + return 0; +} + + +BEGIN_MAPPINGS_LIST + MAPPING_DECONLY(jisx0208) + MAPPING_DECONLY(jisx0212) + MAPPING_ENCONLY(jisxcommon) + MAPPING_DECONLY(jisx0213_1_bmp) + MAPPING_DECONLY(jisx0213_2_bmp) + MAPPING_ENCONLY(jisx0213_bmp) + MAPPING_DECONLY(jisx0213_1_emp) + MAPPING_DECONLY(jisx0213_2_emp) + MAPPING_ENCONLY(jisx0213_emp) + MAPPING_ENCDEC(jisx0213_pair) + MAPPING_ENCDEC(cp932ext) +END_MAPPINGS_LIST + +BEGIN_CODECS_LIST + CODEC_STATELESS(shift_jis) + CODEC_STATELESS(cp932) + CODEC_STATELESS(euc_jp) + CODEC_STATELESS(shift_jis_2004) + CODEC_STATELESS(euc_jis_2004) + { "euc_jisx0213", (void *)2000, NULL, _STATELESS_METHODS(euc_jis_2004) }, + { "shift_jisx0213", (void *)2000, NULL, _STATELESS_METHODS(shift_jis_2004) }, +END_CODECS_LIST + +I_AM_A_MODULE_FOR(jp) |