diff options
Diffstat (limited to 'Modules/cjkcodecs/_codecs_unicode.c')
-rw-r--r-- | Modules/cjkcodecs/_codecs_unicode.c | 560 |
1 files changed, 560 insertions, 0 deletions
diff --git a/Modules/cjkcodecs/_codecs_unicode.c b/Modules/cjkcodecs/_codecs_unicode.c new file mode 100644 index 0000000..b779a5b --- /dev/null +++ b/Modules/cjkcodecs/_codecs_unicode.c @@ -0,0 +1,560 @@ +/* + * _codecs_unicode.c: Codecs collection for Unicode encodings + * + * Written by Hye-Shik Chang <perky@FreeBSD.org> + * $CJKCodecs: _codecs_unicode.c,v 1.5 2004/06/27 21:41:15 perky Exp $ + */ + +#include "cjkcodecs.h" + +/* + * UTF-7 codec + */ + +#define SET_DIRECT 1 +#define SET_OPTIONAL 2 +#define SET_WHITESPACE 3 + +#define _D SET_DIRECT +#define _O SET_OPTIONAL +#define _W SET_WHITESPACE +static const char utf7_sets[128] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, _W, _W, 0, 0, _W, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + _W, _O, _O, _O, _O, _O, _O, _D, _D, _D, _O, 0, _D, _D, _D, 0, + _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _O, _O, _O, _O, _D, + _O, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, + _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _O, 0, _O, _O, _O, + _O, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, + _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _O, _O, _O, 0, 0, +}; +#undef _W +#undef _O +#undef _D + +#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" \ + "0123456789+/"[(n) & 0x3f]) +#define B64CHAR(c) (((c) >= 'A' && (c) <= 'Z') || \ + ((c) >= 'a' && (c) <= 'z') || \ + ((c) >= '0' && (c) <= '9') || \ + (c) == '+' || (c) == '/') +#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ + (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4) + +#define UTF7_DENCODABLE_COMPATIBLE(c) (utf7_sets[c] != 0) +#define UTF7_DENCODABLE_STRICT(c) (utf7_sets[c] == SET_DIRECT || \ + utf7_sets[c] == SET_WHITESPACE) + +#define ESTATE_INITIALIZE(state) \ + ESTATE_SETSTAGE(state, 0) \ + ESTATE_CLEARSHIFTED(state) + +#define ESTATE_SETPENDING(state, v) (state)->c[0] = (v); +#define ESTATE_GETPENDING(state) (state)->c[0] + +#define ESTATE_SETSHIFTED(state) (state)->c[2] = 1; +#define ESTATE_ISSHIFTED(state) ((state)->c[2]) +#define ESTATE_CLEARSHIFTED(state) (state)->c[2] = 0; + +#define ESTATE_SETSTAGE(state, v) (state)->c[3] = (v); +#define ESTATE_GETSTAGE(state) ((state)->c[3]) + +ENCODER_INIT(utf_7) +{ + ESTATE_INITIALIZE(state) + return 0; +} + +ENCODER_RESET(utf_7) +{ + if (ESTATE_ISSHIFTED(state)) { + if (ESTATE_GETSTAGE(state) != 0) { + unsigned char oc; + + oc = B64(ESTATE_GETPENDING(state)); + WRITE2(oc, '-') + NEXT_OUT(2) + } + else { + WRITE1('-') + NEXT_OUT(1) + } + ESTATE_CLEARSHIFTED(state) + } + return 0; +} + +ENCODER(utf_7) +{ + while (inleft > 0) { + Py_UNICODE c1 = IN1, c2 = 0; + size_t insize = 1; + +#if Py_UNICODE_SIZE == 2 + if (c1 >> 10 == 0xd800 >> 10) { /* high surrogate */ + REQUIRE_INBUF(2) + if (IN2 >> 10 != 0xdc00 >> 10) /* low surrogate */ + return 2; /* invalid surrogate pair */ + c2 = IN2; + insize = 2; + } +#else + if (c1 > 0x10ffff) /* UTF-16 unencodable */ + return 1; + else if (c1 > 0xffff) { + c2 = 0xdc00 | ((c1 - 0x10000) & 0x3ff); + c1 = 0xd800 | ((c1 - 0x10000) >> 10); + } +#endif + + for (;;) { + unsigned char oc1, oc2, oc3; + + if (ESTATE_ISSHIFTED(state)) { + if (c1 < 128 && UTF7_DENCODABLE_STRICT(c1)) { + if (ESTATE_GETSTAGE(state) != 0) { + oc1 = B64(ESTATE_GETPENDING( + state)); + WRITE3(oc1, '-', + (unsigned char)c1) + NEXT_OUT(3) + } else { + WRITE2('-', + (unsigned char)c1) + NEXT_OUT(2) + } + ESTATE_CLEARSHIFTED(state) + } else { + switch (ESTATE_GETSTAGE(state)) { + case 0: + oc1 = c1 >> 10; + oc2 = (c1 >> 4) & 0x3f; + WRITE2(B64(oc1), B64(oc2)) + ESTATE_SETPENDING(state, + (c1 & 0x0f) << 2) + ESTATE_SETSTAGE(state, 2) + NEXT_OUT(2) + break; + case 1: + oc1 = ESTATE_GETPENDING(state) + | (c1 >> 12); + oc2 = (c1 >> 6) & 0x3f; + oc3 = c1 & 0x3f; + WRITE3(B64(oc1), B64(oc2), + B64(oc3)) + ESTATE_SETSTAGE(state, 0) + NEXT_OUT(3) + break; + case 2: + oc1 = ESTATE_GETPENDING(state) + | (c1 >> 14); + oc2 = (c1 >> 8) & 0x3f; + oc3 = (c1 >> 2) & 0x3f; + WRITE3(B64(oc1), B64(oc2), + B64(oc3)) + ESTATE_SETPENDING(state, + (c1 & 0x03) << 4) + ESTATE_SETSTAGE(state, 1) + NEXT_OUT(3) + break; + default: + return MBERR_INTERNAL; + } + } + } + else { + if (c1 < 128 && UTF7_DENCODABLE_STRICT(c1)) { + WRITE1((unsigned char)c1) + NEXT_OUT(1) + } + else if (c1 == '+') { + WRITE2('+', '-') + NEXT_OUT(2) + } + else { + oc1 = c1 >> 10; + oc2 = (c1 >> 4) & 0x3f; + WRITE3('+', B64(oc1), B64(oc2)) + ESTATE_SETPENDING(state, + (c1 & 0x0f) << 2) + ESTATE_SETSTAGE(state, 2) + ESTATE_SETSHIFTED(state) + NEXT_OUT(3) + } + } + + if (c2 != 0) { + c1 = c2; + c2 = 0; + } + else + break; + } + + NEXT_IN(insize) + } + + return 0; +} + +#define DSTATE_INITIALIZE(state) \ + DSTATE_SETBSTAGE(state, 0) \ + DSTATE_CLEARSHIFTED(state) \ + DSTATE_SETULENGTH(state, 0) \ + DSTATE_SETUPENDING1(state, 0) \ + DSTATE_SETUPENDING2(state, 0) + +/* XXX: Type-mixed usage of a state union may be not so portable. + * If you see any problem with this on your platfom. Please let + * me know. */ + +#define DSTATE_SETSHIFTED(state) (state)->c[0] = 1; +#define DSTATE_ISSHIFTED(state) ((state)->c[0]) +#define DSTATE_CLEARSHIFTED(state) (state)->c[0] = 0; + +#define DSTATE_SETBSTAGE(state, v) (state)->c[1] = (v); +#define DSTATE_GETBSTAGE(state) ((state)->c[1]) + +#define DSTATE_SETBPENDING(state, v) (state)->c[2] = (v); +#define DSTATE_GETBPENDING(state) ((state)->c[2]) + +#define DSTATE_SETULENGTH(state, v) (state)->c[3] = (v); +#define DSTATE_GETULENGTH(state) ((state)->c[3]) + +#define DSTATE_SETUPENDING1(state, v) (state)->u2[2] = (v); +#define DSTATE_GETUPENDING1(state) (state)->u2[2] + +#define DSTATE_SETUPENDING2(state, v) (state)->u2[3] = (v); +#define DSTATE_GETUPENDING2(state) (state)->u2[3] + +#define DSTATE_UAPPEND(state, v) \ + (state)->u2[(state)->c[3] > 1 ? 3 : 2] |= \ + ((state)->c[3] & 1) ? (v) : ((ucs2_t)(v)) << 8; \ + (state)->c[3]++; + +DECODER_INIT(utf_7) +{ + DSTATE_INITIALIZE(state) + return 0; +} + +static int +utf_7_flush(MultibyteCodec_State *state, + Py_UNICODE **outbuf, size_t *outleft) +{ + switch (DSTATE_GETULENGTH(state)) { + case 2: { + ucs2_t uc; + + uc = DSTATE_GETUPENDING1(state); +#if Py_UNICODE_SIZE == 4 + if (uc >> 10 == 0xd800 >> 10) + return MBERR_TOOFEW; +#endif + OUT1(uc) + (*outbuf)++; + (*outleft)--; + DSTATE_SETULENGTH(state, 0) + DSTATE_SETUPENDING1(state, 0) + break; + } +#if Py_UNICODE_SIZE == 4 + case 4: + if (DSTATE_GETUPENDING2(state) >> 10 != 0xdc00 >> 10) + return 1; + OUT1(0x10000 + (((ucs4_t)DSTATE_GETUPENDING1(state) - 0xd800) + << 10) + (DSTATE_GETUPENDING2(state) - 0xdc00)) + (*outbuf)++; + (*outleft)--; + DSTATE_SETULENGTH(state, 0) + DSTATE_SETUPENDING1(state, 0) + DSTATE_SETUPENDING2(state, 0) + break; +#endif + case 0: /* FALLTHROUGH */ + case 1: /* FALLTHROUGH */ + case 3: + return MBERR_TOOFEW; + default: + return MBERR_INTERNAL; + } + + return 0; +} + +DECODER_RESET(utf_7) +{ + DSTATE_INITIALIZE(state) + return 0; +} + +DECODER(utf_7) +{ + while (inleft > 0) { + unsigned char c = IN1; + int r; + + if (!DSTATE_ISSHIFTED(state)) { + if (c == '+') { + REQUIRE_INBUF(2) + if (inleft >= 2 && IN2 == '-') { + WRITE1('+') + NEXT(2, 1) + } + else { + DSTATE_SETSHIFTED(state) + NEXT_IN(1) + } + } + else if (c < 128 && UTF7_DENCODABLE_COMPATIBLE(c)) { + WRITE1(c) + NEXT(1, 1) + } + else + return 1; + } + else if (B64CHAR(c)) { + unsigned char tb; + + REQUIRE_OUTBUF(1) + c = UB64(c); + assert(DSTATE_GETULENGTH(state) < 4); + + switch (DSTATE_GETBSTAGE(state)) { + case 0: + DSTATE_SETBPENDING(state, c << 2) + DSTATE_SETBSTAGE(state, 1) + break; + case 1: + tb = DSTATE_GETBPENDING(state) | (c >> 4); + DSTATE_SETBPENDING(state, c << 4) + DSTATE_SETBSTAGE(state, 2) + DSTATE_UAPPEND(state, tb) + break; + case 2: + tb = DSTATE_GETBPENDING(state) | (c >> 2); + DSTATE_SETBPENDING(state, c << 6) + DSTATE_SETBSTAGE(state, 3) + DSTATE_UAPPEND(state, tb) + break; + case 3: + tb = DSTATE_GETBPENDING(state) | c; + DSTATE_SETBSTAGE(state, 0) + DSTATE_UAPPEND(state, tb) + break; + } + + r = utf_7_flush(state, outbuf, &outleft); + if (r != 0 && r != MBERR_TOOFEW) + return r; + NEXT_IN(1) + } + else if (c == '-' || UTF7_DENCODABLE_COMPATIBLE(c)) { + if (DSTATE_GETBSTAGE(state) != 0) { + DSTATE_UAPPEND(state, DSTATE_GETBSTAGE(state)) + DSTATE_SETBSTAGE(state, 0) + } + r = utf_7_flush(state, outbuf, &outleft); + if (r != 0 && r != MBERR_TOOFEW) + return r; + DSTATE_CLEARSHIFTED(state) + + if (c != '-') { + WRITE1(c) + NEXT_OUT(1) + } + NEXT_IN(1) + } + else + return 1; + } + + return 0; +} + + +/* + * UTF-8 codec + */ + +ENCODER(utf_8) +{ + while (inleft > 0) { + ucs4_t c = **inbuf; + size_t outsize, insize = 1; + + if (c < 0x80) outsize = 1; + else if (c < 0x800) outsize = 2; + else { +#if Py_UNICODE_SIZE == 2 + if (c >> 10 == 0xd800 >> 10) { /* high surrogate */ + if (inleft < 2) { + if (!(flags & MBENC_FLUSH)) + return MBERR_TOOFEW; + } + else if ((*inbuf)[1] >> 10 == 0xdc00 >> 10) { + /* low surrogate */ + c = 0x10000 + ((c - 0xd800) << 10) + + ((ucs4_t)((*inbuf)[1]) - 0xdc00); + insize = 2; + } + } +#endif + if (c < 0x10000) outsize = 3; + else if (c < 0x200000) outsize = 4; + else if (c < 0x4000000) outsize = 5; + else outsize = 6; + } + + REQUIRE_OUTBUF(outsize) + + switch (outsize) { + case 6: + (*outbuf)[5] = 0x80 | (c & 0x3f); + c = c >> 6; + c |= 0x4000000; + /* FALLTHROUGH */ + case 5: + (*outbuf)[4] = 0x80 | (c & 0x3f); + c = c >> 6; + c |= 0x200000; + /* FALLTHROUGH */ + case 4: + (*outbuf)[3] = 0x80 | (c & 0x3f); + c = c >> 6; + c |= 0x10000; + /* FALLTHROUGH */ + case 3: + (*outbuf)[2] = 0x80 | (c & 0x3f); + c = c >> 6; + c |= 0x800; + /* FALLTHROUGH */ + case 2: + (*outbuf)[1] = 0x80 | (c & 0x3f); + c = c >> 6; + c |= 0xc0; + /* FALLTHROUGH */ + case 1: + (*outbuf)[0] = c; + } + + NEXT(insize, outsize) + } + + return 0; +} + +DECODER(utf_8) +{ + while (inleft > 0) { + unsigned char c = **inbuf; + + REQUIRE_OUTBUF(1) + + if (c < 0x80) { + (*outbuf)[0] = (unsigned char)c; + NEXT(1, 1) + } + else if (c < 0xc2) { + return 1; + } + else if (c < 0xe0) { + unsigned char c2; + + REQUIRE_INBUF(2) + c2 = (*inbuf)[1]; + if (!((c2 ^ 0x80) < 0x40)) + return 2; + **outbuf = ((Py_UNICODE)(c & 0x1f) << 6) | + (Py_UNICODE)(c2 ^ 0x80); + NEXT(2, 1) + } + else if (c < 0xf0) { + unsigned char c2, c3; + + REQUIRE_INBUF(3) + c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; + if (!((c2 ^ 0x80) < 0x40 && + (c3 ^ 0x80) < 0x40 && (c >= 0xe1 || c2 >= 0xa0))) + return 3; + **outbuf = ((Py_UNICODE)(c & 0x0f) << 12) + | ((Py_UNICODE)(c2 ^ 0x80) << 6) + | (Py_UNICODE)(c3 ^ 0x80); + NEXT(3, 1) + } + else if (c < 0xf8) { + unsigned char c2, c3, c4; + ucs4_t code; + + REQUIRE_INBUF(4) + c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; + c4 = (*inbuf)[3]; + if (!((c2 ^ 0x80) < 0x40 && + (c3 ^ 0x80) < 0x40 && (c4 ^ 0x80) < 0x40 && + (c >= 0xf1 || c2 >= 0x90))) + return 4; + code = ((ucs4_t)(c & 0x07) << 18) + | ((ucs4_t)(c2 ^ 0x80) << 12) + | ((ucs4_t)(c3 ^ 0x80) << 6) + | (ucs4_t)(c4 ^ 0x80); + WRITEUCS4(code) + NEXT_IN(4) + } + else if (c < 0xfc) { + unsigned char c2, c3, c4, c5; + ucs4_t code; + + REQUIRE_INBUF(5) + c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; + c4 = (*inbuf)[3]; c5 = (*inbuf)[4]; + if (!((c2 ^ 0x80) < 0x40 && + (c3 ^ 0x80) < 0x40 && (c4 ^ 0x80) < 0x40 && + (c5 ^ 0x80) < 0x40 && (c >= 0xf9 || c2 >= 0x88))) + return 5; + code = ((ucs4_t)(c & 0x03) << 24) + | ((ucs4_t)(c2 ^ 0x80) << 18) + | ((ucs4_t)(c3 ^ 0x80) << 12) + | ((ucs4_t)(c4 ^ 0x80) << 6) + | (ucs4_t)(c5 ^ 0x80); + WRITEUCS4(code) + NEXT_IN(5) + } + else if (c < 0xff) { + unsigned char c2, c3, c4, c5, c6; + ucs4_t code; + + REQUIRE_INBUF(6) + c2 = (*inbuf)[1]; c3 = (*inbuf)[2]; + c4 = (*inbuf)[3]; c5 = (*inbuf)[4]; + c6 = (*inbuf)[5]; + if (!((c2 ^ 0x80) < 0x40 && + (c3 ^ 0x80) < 0x40 && (c4 ^ 0x80) < 0x40 && + (c5 ^ 0x80) < 0x40 && (c6 ^ 0x80) < 0x40 && + (c >= 0xfd || c2 >= 0x84))) + return 6; + code = ((ucs4_t)(c & 0x01) << 30) + | ((ucs4_t)(c2 ^ 0x80) << 24) + | ((ucs4_t)(c3 ^ 0x80) << 18) + | ((ucs4_t)(c4 ^ 0x80) << 12) + | ((ucs4_t)(c5 ^ 0x80) << 6) + | (ucs4_t)(c6 ^ 0x80); + WRITEUCS4(code) + NEXT_IN(6) + } + else + return 1; + } + + return 0; +} + + +BEGIN_MAPPINGS_LIST +END_MAPPINGS_LIST + +BEGIN_CODECS_LIST + CODEC_STATEFUL(utf_7) + CODEC_STATELESS(utf_8) +END_CODECS_LIST + +I_AM_A_MODULE_FOR(unicode) |