diff options
author | Hye-Shik Chang <hyeshik@gmail.com> | 2007-08-20 06:49:18 (GMT) |
---|---|---|
committer | Hye-Shik Chang <hyeshik@gmail.com> | 2007-08-20 06:49:18 (GMT) |
commit | 2390104d81fb90c525fb353589837fc66837e5ff (patch) | |
tree | 873e11368d5a0759df39bfaaf98493c31ac5f9df /Modules | |
parent | c553f4290766e0a9b713af7c120b9c2a0af1df64 (diff) | |
download | cpython-2390104d81fb90c525fb353589837fc66837e5ff.zip cpython-2390104d81fb90c525fb353589837fc66837e5ff.tar.gz cpython-2390104d81fb90c525fb353589837fc66837e5ff.tar.bz2 |
Add cheot-ga-keut composed make-up sequence support in EUC-KR codec.
Diffstat (limited to 'Modules')
-rw-r--r-- | Modules/cjkcodecs/_codecs_kr.c | 111 |
1 files changed, 104 insertions, 7 deletions
diff --git a/Modules/cjkcodecs/_codecs_kr.c b/Modules/cjkcodecs/_codecs_kr.c index 2a95bbe..161967e 100644 --- a/Modules/cjkcodecs/_codecs_kr.c +++ b/Modules/cjkcodecs/_codecs_kr.c @@ -11,6 +11,26 @@ * EUC-KR codec */ +#define EUCKR_JAMO_FIRSTBYTE 0xA4 +#define EUCKR_JAMO_FILLER 0xD4 + +static const unsigned char u2cgk_choseong[19] = { + 0xa1, 0xa2, 0xa4, 0xa7, 0xa8, 0xa9, 0xb1, 0xb2, + 0xb3, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, + 0xbc, 0xbd, 0xbe +}; +static const unsigned char u2cgk_jungseong[21] = { + 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, + 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, + 0xcf, 0xd0, 0xd1, 0xd2, 0xd3 +}; +static const unsigned char u2cgk_jongseong[28] = { + 0xd4, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, + 0xb1, 0xb2, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xba, + 0xbb, 0xbc, 0xbd, 0xbe +}; + ENCODER(euc_kr) { while (inleft > 0) { @@ -28,17 +48,57 @@ ENCODER(euc_kr) TRYMAP_ENC(cp949, code, c); else return 1; - if (code & 0x8000) /* MSB set: CP949 */ - return 1; + if ((code & 0x8000) == 0) { + /* KS X 1001 coded character */ + OUT1((code >> 8) | 0x80) + OUT2((code & 0xFF) | 0x80) + NEXT(1, 2) + } + else { /* Mapping is found in CP949 extension, + * but we encode it in KS X 1001:1998 Annex 3, + * make-up sequence for EUC-KR. */ - OUT1((code >> 8) | 0x80) - OUT2((code & 0xFF) | 0x80) - NEXT(1, 2) + REQUIRE_OUTBUF(8) + + /* syllable composition precedence */ + OUT1(EUCKR_JAMO_FIRSTBYTE) + OUT2(EUCKR_JAMO_FILLER) + + /* All codepoints in CP949 extension are in unicode + * Hangul Syllable area. */ + assert(0xac00 <= c && c <= 0xd7a3); + c -= 0xac00; + + OUT3(EUCKR_JAMO_FIRSTBYTE) + OUT4(u2cgk_choseong[c / 588]) + NEXT_OUT(4) + + OUT1(EUCKR_JAMO_FIRSTBYTE) + OUT2(u2cgk_jungseong[(c / 28) % 21]) + OUT3(EUCKR_JAMO_FIRSTBYTE) + OUT4(u2cgk_jongseong[c % 28]) + NEXT(1, 4) + } } return 0; } +#define NONE 127 + +static const unsigned char cgk2u_choseong[] = { /* [A1, BE] */ + 0, 1, NONE, 2, NONE, NONE, 3, 4, + 5, NONE, NONE, NONE, NONE, NONE, NONE, NONE, + 6, 7, 8, NONE, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18 +}; +static const unsigned char cgk2u_jongseong[] = { /* [A1, BE] */ + 1, 2, 3, 4, 5, 6, 7, NONE, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, NONE, 18, 19, 20, 21, 22, + NONE, 23, 24, 25, 26, 27 +}; + DECODER(euc_kr) { while (inleft > 0) { @@ -54,13 +114,50 @@ DECODER(euc_kr) REQUIRE_INBUF(2) - TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80) { + if (c == EUCKR_JAMO_FIRSTBYTE && + IN2 == EUCKR_JAMO_FILLER) { + /* KS X 1001:1998 Annex 3 make-up sequence */ + DBCHAR cho, jung, jong; + + REQUIRE_INBUF(8) + if ((*inbuf)[2] != EUCKR_JAMO_FIRSTBYTE || + (*inbuf)[4] != EUCKR_JAMO_FIRSTBYTE || + (*inbuf)[6] != EUCKR_JAMO_FIRSTBYTE) + return 8; + + c = (*inbuf)[3]; + if (0xa1 <= c && c <= 0xbe) + cho = cgk2u_choseong[c - 0xa1]; + else + cho = NONE; + + c = (*inbuf)[5]; + jung = (0xbf <= c && c <= 0xd3) ? c - 0xbf : NONE; + + c = (*inbuf)[7]; + if (c == EUCKR_JAMO_FILLER) + jong = 0; + else if (0xa1 <= c && c <= 0xbe) + jong = cgk2u_jongseong[c - 0xa1]; + else + jong = NONE; + + if (cho == NONE || jung == NONE || jong == NONE) + return 8; + + OUT1(0xac00 + cho*588 + jung*28 + jong); + NEXT(8, 1) + } + else TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80) { NEXT(2, 1) - } else return 2; + } + else + return 2; } return 0; } +#undef NONE /* |