summaryrefslogtreecommitdiffstats
path: root/Modules/cjkcodecs/_codecs_unicode.c
diff options
context:
space:
mode:
Diffstat (limited to 'Modules/cjkcodecs/_codecs_unicode.c')
-rw-r--r--Modules/cjkcodecs/_codecs_unicode.c560
1 files changed, 560 insertions, 0 deletions
diff --git a/Modules/cjkcodecs/_codecs_unicode.c b/Modules/cjkcodecs/_codecs_unicode.c
new file mode 100644
index 0000000..b779a5b
--- /dev/null
+++ b/Modules/cjkcodecs/_codecs_unicode.c
@@ -0,0 +1,560 @@
+/*
+ * _codecs_unicode.c: Codecs collection for Unicode encodings
+ *
+ * Written by Hye-Shik Chang <perky@FreeBSD.org>
+ * $CJKCodecs: _codecs_unicode.c,v 1.5 2004/06/27 21:41:15 perky Exp $
+ */
+
+#include "cjkcodecs.h"
+
+/*
+ * UTF-7 codec
+ */
+
+/*
+ * Character classes used by the UTF-7 codec, indexed by 7-bit ASCII code.
+ * The classes follow RFC 2152.  A zero entry marks a character that belongs
+ * to none of the sets ('+', '\\', '~' and the remaining control characters).
+ *
+ * '/' (0x2f) is a member of Set D in RFC 2152 and is classified as such
+ * here, so that it is accepted as a literal character outside base64 runs.
+ */
+#define SET_DIRECT 1        /* Set D: directly encoded characters */
+#define SET_OPTIONAL 2      /* Set O: optionally direct characters */
+#define SET_WHITESPACE 3    /* TAB, LF, CR and SPACE */
+
+#define _D SET_DIRECT
+#define _O SET_OPTIONAL
+#define _W SET_WHITESPACE
+static const char utf7_sets[128] = {
+    /* 0x00 */  0,  0,  0,  0,  0,  0,  0,  0,  0, _W, _W,  0,  0, _W,  0,  0,
+    /* 0x10 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    /* 0x20 */ _W, _O, _O, _O, _O, _O, _O, _D, _D, _D, _O,  0, _D, _D, _D, _D,
+    /* 0x30 */ _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _O, _O, _O, _O, _D,
+    /* 0x40 */ _O, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D,
+    /* 0x50 */ _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _O,  0, _O, _O, _O,
+    /* 0x60 */ _O, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D,
+    /* 0x70 */ _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _O, _O, _O,  0,  0,
+};
+#undef _W
+#undef _O
+#undef _D
+
+/* Map the low six bits of n to the matching base64 alphabet character. */
+#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" \
+    "0123456789+/"[(n) & 0x3f])
+/* True when c is one of the 64 base64 alphabet characters ('=' is not). */
+#define B64CHAR(c) (((c) >= 'A' && (c) <= 'Z') || \
+    ((c) >= 'a' && (c) <= 'z') || \
+    ((c) >= '0' && (c) <= '9') || \
+    (c) == '+' || (c) == '/')
+/* Inverse of B64(); the result is only meaningful when B64CHAR(c) holds. */
+#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
+    (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)
+
+/*
+ * Classification helpers on top of utf7_sets[].  The table has only 128
+ * entries, so any value with bits set above the low seven is rejected
+ * before the table is indexed; callers may pass arbitrary bytes or code
+ * units without reading past the end of the table.
+ */
+#define UTF7_DENCODABLE_COMPATIBLE(c) \
+    ((((c) & ~0x7f) == 0) && utf7_sets[c] != 0)
+#define UTF7_DENCODABLE_STRICT(c) \
+    ((((c) & ~0x7f) == 0) && \
+     (utf7_sets[c] == SET_DIRECT || utf7_sets[c] == SET_WHITESPACE))
+
+/*
+ * UTF-7 encoder state, stored in the `c` array of the state object:
+ *   c[0]  pending: base64 bits not yet written, already shifted into place
+ *   c[2]  shifted: non-zero while a base64 run is open
+ *   c[3]  stage:   selects how the next 16-bit unit is split into sextets
+ * The modifying macros carry their own ';' and are used as whole statements.
+ */
+#define ESTATE_GETPENDING(state) (state)->c[0]
+#define ESTATE_SETPENDING(state, v) (state)->c[0] = (v);
+
+#define ESTATE_ISSHIFTED(state) ((state)->c[2])
+#define ESTATE_SETSHIFTED(state) (state)->c[2] = 1;
+#define ESTATE_CLEARSHIFTED(state) (state)->c[2] = 0;
+
+#define ESTATE_GETSTAGE(state) ((state)->c[3])
+#define ESTATE_SETSTAGE(state, v) (state)->c[3] = (v);
+
+/* Stage 0 and not shifted; the pending byte is left as it is. */
+#define ESTATE_INITIALIZE(state) \
+    ESTATE_SETSTAGE(state, 0) \
+    ESTATE_CLEARSHIFTED(state)
+
+/* Put the UTF-7 encoder in its start state: stage 0, no base64 run open. */
+ENCODER_INIT(utf_7)
+{
+    ESTATE_SETSTAGE(state, 0)
+    ESTATE_CLEARSHIFTED(state)
+    return 0;
+}
+
+/*
+ * Close a base64 run that is still open: write the sextet holding any
+ * pending bits, then the terminating '-', and leave shifted mode.
+ * Nothing is written when no run is open.
+ */
+ENCODER_RESET(utf_7)
+{
+    if (!ESTATE_ISSHIFTED(state))
+        return 0;
+
+    if (ESTATE_GETSTAGE(state) == 0) {
+        /* No bits are waiting; only the terminator is needed. */
+        WRITE1('-')
+        NEXT_OUT(1)
+    }
+    else {
+        unsigned char tail = B64(ESTATE_GETPENDING(state));
+
+        WRITE2(tail, '-')
+        NEXT_OUT(2)
+    }
+    ESTATE_CLEARSHIFTED(state)
+    return 0;
+}
+
+/*
+ * UTF-7 encoder.  Each input character is handled as one or two 16-bit
+ * units (c1, and c2 for the second half of a surrogate pair).  Characters
+ * accepted by UTF7_DENCODABLE_STRICT are written as they are; everything
+ * else is written as base64 sextets inside a "+...-" run.  The shift flag,
+ * the stage and the pending bits are kept in the state between calls.
+ */
+ENCODER(utf_7)
+{
+ while (inleft > 0) {
+ Py_UNICODE c1 = IN1, c2 = 0;
+ size_t insize = 1;
+
+#if Py_UNICODE_SIZE == 2
+ /* 16-bit Py_UNICODE: a high surrogate must be followed by a low
+  * surrogate; both input units are then consumed together. */
+ if (c1 >> 10 == 0xd800 >> 10) { /* high surrogate */
+ REQUIRE_INBUF(2)
+ if (IN2 >> 10 != 0xdc00 >> 10) /* low surrogate */
+ return 2; /* invalid surrogate pair */
+ c2 = IN2;
+ insize = 2;
+ }
+#else
+ /* 32-bit Py_UNICODE: split code points above 0xffff into a
+  * surrogate pair held in c1 (high) and c2 (low). */
+ if (c1 > 0x10ffff) /* UTF-16 unencodable */
+ return 1;
+ else if (c1 > 0xffff) {
+ c2 = 0xdc00 | ((c1 - 0x10000) & 0x3ff);
+ c1 = 0xd800 | ((c1 - 0x10000) >> 10);
+ }
+#endif
+
+ /* Runs once for c1 and, when c2 is set, a second time for c2. */
+ for (;;) {
+ unsigned char oc1, oc2, oc3;
+
+ if (ESTATE_ISSHIFTED(state)) {
+ if (c1 < 128 && UTF7_DENCODABLE_STRICT(c1)) {
+ /* Direct character ends the run: write the sextet with
+  * the pending bits (if any), then '-', then the character. */
+ if (ESTATE_GETSTAGE(state) != 0) {
+ oc1 = B64(ESTATE_GETPENDING(
+ state));
+ WRITE3(oc1, '-',
+ (unsigned char)c1)
+ NEXT_OUT(3)
+ } else {
+ WRITE2('-',
+ (unsigned char)c1)
+ NEXT_OUT(2)
+ }
+ ESTATE_CLEARSHIFTED(state)
+ } else {
+ switch (ESTATE_GETSTAGE(state)) {
+ case 0:
+ /* Nothing pending: write bits 15-10 and 9-4, keep the
+  * low 4 bits (shifted left by 2) as pending. */
+ oc1 = c1 >> 10;
+ oc2 = (c1 >> 4) & 0x3f;
+ WRITE2(B64(oc1), B64(oc2))
+ ESTATE_SETPENDING(state,
+ (c1 & 0x0f) << 2)
+ ESTATE_SETSTAGE(state, 2)
+ NEXT_OUT(2)
+ break;
+ case 1:
+ /* Pending bits are completed by the top 4 bits of c1;
+  * the remaining 12 bits fill two sextets, nothing is
+  * left over. */
+ oc1 = ESTATE_GETPENDING(state)
+ | (c1 >> 12);
+ oc2 = (c1 >> 6) & 0x3f;
+ oc3 = c1 & 0x3f;
+ WRITE3(B64(oc1), B64(oc2),
+ B64(oc3))
+ ESTATE_SETSTAGE(state, 0)
+ NEXT_OUT(3)
+ break;
+ case 2:
+ /* Pending bits are completed by the top 2 bits of c1;
+  * two more sextets follow and the low 2 bits (shifted
+  * left by 4) become the new pending value. */
+ oc1 = ESTATE_GETPENDING(state)
+ | (c1 >> 14);
+ oc2 = (c1 >> 8) & 0x3f;
+ oc3 = (c1 >> 2) & 0x3f;
+ WRITE3(B64(oc1), B64(oc2),
+ B64(oc3))
+ ESTATE_SETPENDING(state,
+ (c1 & 0x03) << 4)
+ ESTATE_SETSTAGE(state, 1)
+ NEXT_OUT(3)
+ break;
+ default:
+ return MBERR_INTERNAL;
+ }
+ }
+ }
+ else {
+ if (c1 < 128 && UTF7_DENCODABLE_STRICT(c1)) {
+ WRITE1((unsigned char)c1)
+ NEXT_OUT(1)
+ }
+ else if (c1 == '+') {
+ /* A literal '+' is written as "+-". */
+ WRITE2('+', '-')
+ NEXT_OUT(2)
+ }
+ else {
+ /* Open a base64 run: '+' followed by the same output
+  * and state changes as stage 0 above. */
+ oc1 = c1 >> 10;
+ oc2 = (c1 >> 4) & 0x3f;
+ WRITE3('+', B64(oc1), B64(oc2))
+ ESTATE_SETPENDING(state,
+ (c1 & 0x0f) << 2)
+ ESTATE_SETSTAGE(state, 2)
+ ESTATE_SETSHIFTED(state)
+ NEXT_OUT(3)
+ }
+ }
+
+ if (c2 != 0) {
+ c1 = c2;
+ c2 = 0;
+ }
+ else
+ break;
+ }
+
+ NEXT_IN(insize)
+ }
+
+ return 0;
+}
+
+/*
+ * UTF-7 decoder state, stored in the `c` and `u2` arrays of the state object:
+ *   c[0]   shifted:   non-zero while inside a base64 run
+ *   c[1]   bstage:    position (0-3) inside the current group of sextets
+ *   c[2]   bpending:  bits of the byte currently being assembled
+ *   c[3]   ulength:   number of bytes appended to the pending units
+ *   u2[2]  upending1: first pending 16-bit unit
+ *   u2[3]  upending2: second pending 16-bit unit
+ * The modifying macros carry their own ';' and are used as whole statements.
+ *
+ * XXX: Type-mixed usage of a state union may be not so portable.
+ * If you see any problem with this on your platform, please let
+ * me know.
+ */
+
+#define DSTATE_ISSHIFTED(state) ((state)->c[0])
+#define DSTATE_SETSHIFTED(state) (state)->c[0] = 1;
+#define DSTATE_CLEARSHIFTED(state) (state)->c[0] = 0;
+
+#define DSTATE_GETBSTAGE(state) ((state)->c[1])
+#define DSTATE_SETBSTAGE(state, v) (state)->c[1] = (v);
+
+#define DSTATE_GETBPENDING(state) ((state)->c[2])
+#define DSTATE_SETBPENDING(state, v) (state)->c[2] = (v);
+
+#define DSTATE_GETULENGTH(state) ((state)->c[3])
+#define DSTATE_SETULENGTH(state, v) (state)->c[3] = (v);
+
+#define DSTATE_GETUPENDING1(state) (state)->u2[2]
+#define DSTATE_SETUPENDING1(state, v) (state)->u2[2] = (v);
+
+#define DSTATE_GETUPENDING2(state) (state)->u2[3]
+#define DSTATE_SETUPENDING2(state, v) (state)->u2[3] = (v);
+
+/*
+ * OR one more byte into the pending units and count it.  Bytes 0 and 1 go
+ * to u2[2], later bytes to u2[3]; an even-numbered byte lands in the high
+ * half of its unit, an odd-numbered one in the low half.
+ */
+#define DSTATE_UAPPEND(state, v) \
+    (state)->u2[(state)->c[3] > 1 ? 3 : 2] |= \
+        ((state)->c[3] & 1) ? (v) : ((ucs2_t)(v)) << 8; \
+    (state)->c[3]++;
+
+/* Clears everything except the bpending byte. */
+#define DSTATE_INITIALIZE(state) \
+    DSTATE_SETBSTAGE(state, 0) \
+    DSTATE_CLEARSHIFTED(state) \
+    DSTATE_SETULENGTH(state, 0) \
+    DSTATE_SETUPENDING1(state, 0) \
+    DSTATE_SETUPENDING2(state, 0)
+
+/* Put the UTF-7 decoder in its start state: not shifted, nothing pending. */
+DECODER_INIT(utf_7)
+{
+    DSTATE_SETBSTAGE(state, 0)
+    DSTATE_CLEARSHIFTED(state)
+    DSTATE_SETULENGTH(state, 0)
+    DSTATE_SETUPENDING1(state, 0)
+    DSTATE_SETUPENDING2(state, 0)
+    return 0;
+}
+
+/*
+ * Write out the character collected in the decoder state, if it is complete.
+ *
+ * Returns 0 after storing one character at **outbuf (advancing *outbuf and
+ * decrementing *outleft), MBERR_TOOFEW while the pending data is still
+ * incomplete, 1 when the second pending unit is not a low surrogate
+ * (32-bit Py_UNICODE only) and MBERR_INTERNAL for any other length.
+ * *outleft is not checked here before the character is stored.
+ */
+static int
+utf_7_flush(MultibyteCodec_State *state,
+    Py_UNICODE **outbuf, size_t *outleft)
+{
+    switch (DSTATE_GETULENGTH(state)) {
+    case 0: /* FALLTHROUGH */
+    case 1: /* FALLTHROUGH */
+    case 3:
+        /* No complete 16-bit unit is available yet. */
+        return MBERR_TOOFEW;
+    case 2: {
+        ucs2_t unit = DSTATE_GETUPENDING1(state);
+
+#if Py_UNICODE_SIZE == 4
+        /* A high surrogate has to wait for its second unit. */
+        if (unit >> 10 == 0xd800 >> 10)
+            return MBERR_TOOFEW;
+#endif
+        OUT1(unit)
+        ++*outbuf;
+        --*outleft;
+        DSTATE_SETULENGTH(state, 0)
+        DSTATE_SETUPENDING1(state, 0)
+        break;
+    }
+#if Py_UNICODE_SIZE == 4
+    case 4:
+        if (DSTATE_GETUPENDING2(state) >> 10 != 0xdc00 >> 10)
+            return 1;
+        /* Combine the surrogate pair into a single code point. */
+        OUT1(0x10000 + (((ucs4_t)DSTATE_GETUPENDING1(state) - 0xd800)
+            << 10) + (DSTATE_GETUPENDING2(state) - 0xdc00))
+        ++*outbuf;
+        --*outleft;
+        DSTATE_SETULENGTH(state, 0)
+        DSTATE_SETUPENDING1(state, 0)
+        DSTATE_SETUPENDING2(state, 0)
+        break;
+#endif
+    default:
+        return MBERR_INTERNAL;
+    }
+
+    return 0;
+}
+
+/* Drop any partially decoded data and return to the unshifted start state. */
+DECODER_RESET(utf_7)
+{
+    DSTATE_SETBSTAGE(state, 0)
+    DSTATE_CLEARSHIFTED(state)
+    DSTATE_SETULENGTH(state, 0)
+    DSTATE_SETUPENDING1(state, 0)
+    DSTATE_SETUPENDING2(state, 0)
+    return 0;
+}
+
+/*
+ * UTF-7 decoder.
+ *
+ * Outside a base64 run, "+-" yields a literal '+', a lone '+' opens a run,
+ * and any other byte must be a 7-bit character with a non-zero class in
+ * utf7_sets[]; otherwise 1 is returned.
+ *
+ * Inside a run, every base64 character contributes six bits.  Completed
+ * bytes are appended to the pending 16-bit units and utf_7_flush() writes a
+ * character as soon as one is complete.  The run ends at '-' (which is
+ * consumed) or at any other acceptable 7-bit character (which is copied to
+ * the output).  At that point the decoder state is reset completely:
+ * leftover padding bits and any incomplete pending unit are dropped, so
+ * they can neither be emitted as data nor leak into the next run.
+ */
+DECODER(utf_7)
+{
+    while (inleft > 0) {
+        unsigned char c = IN1;
+        int r;
+
+        if (!DSTATE_ISSHIFTED(state)) {
+            if (c == '+') {
+                /* The next byte decides between "+-" and a base64 run. */
+                REQUIRE_INBUF(2)
+                if (inleft >= 2 && IN2 == '-') {
+                    WRITE1('+')
+                    NEXT(2, 1)
+                }
+                else {
+                    DSTATE_SETSHIFTED(state)
+                    NEXT_IN(1)
+                }
+            }
+            else if (c < 128 && UTF7_DENCODABLE_COMPATIBLE(c)) {
+                WRITE1(c)
+                NEXT(1, 1)
+            }
+            else
+                return 1;
+        }
+        else if (B64CHAR(c)) {
+            unsigned char tb;
+
+            /* utf_7_flush() stores without checking the output space. */
+            REQUIRE_OUTBUF(1)
+            c = UB64(c);
+            assert(DSTATE_GETULENGTH(state) < 4);
+
+            switch (DSTATE_GETBSTAGE(state)) {
+            case 0:
+                DSTATE_SETBPENDING(state, c << 2)
+                DSTATE_SETBSTAGE(state, 1)
+                break;
+            case 1:
+                tb = DSTATE_GETBPENDING(state) | (c >> 4);
+                DSTATE_SETBPENDING(state, c << 4)
+                DSTATE_SETBSTAGE(state, 2)
+                DSTATE_UAPPEND(state, tb)
+                break;
+            case 2:
+                tb = DSTATE_GETBPENDING(state) | (c >> 2);
+                DSTATE_SETBPENDING(state, c << 6)
+                DSTATE_SETBSTAGE(state, 3)
+                DSTATE_UAPPEND(state, tb)
+                break;
+            case 3:
+                tb = DSTATE_GETBPENDING(state) | c;
+                DSTATE_SETBSTAGE(state, 0)
+                DSTATE_UAPPEND(state, tb)
+                break;
+            default:
+                return MBERR_INTERNAL;
+            }
+
+            /* MBERR_TOOFEW only means that no character is complete yet. */
+            r = utf_7_flush(state, outbuf, &outleft);
+            if (r != 0 && r != MBERR_TOOFEW)
+                return r;
+            NEXT_IN(1)
+        }
+        else if (c == '-' || (c < 128 && UTF7_DENCODABLE_COMPATIBLE(c))) {
+            /*
+             * End of the base64 run.  Every complete character has already
+             * been written by the flush that follows each base64 character,
+             * so whatever is still held in the state is padding or an
+             * incomplete unit.  Reset the state instead of feeding it back
+             * into the pending units.
+             */
+            DSTATE_INITIALIZE(state)
+
+            if (c != '-') {
+                WRITE1(c)
+                NEXT_OUT(1)
+            }
+            NEXT_IN(1)
+        }
+        else
+            return 1;
+    }
+
+    return 0;
+}
+
+
+/*
+ * UTF-8 codec
+ */
+
+/*
+ * UTF-8 encoder.  Writes one to six bytes per character depending on its
+ * magnitude.  With 16-bit Py_UNICODE a high surrogate followed by a low
+ * surrogate is combined into one code point first; a high surrogate at the
+ * very end of the input returns MBERR_TOOFEW unless MBENC_FLUSH is set.
+ */
+ENCODER(utf_8)
+{
+    /* Lead-byte marker for each encoded length; index 0 is unused. */
+    static const unsigned char lead_mark[7] = {
+        0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
+    };
+
+    while (inleft > 0) {
+        ucs4_t code = **inbuf;
+        size_t consumed = 1, outsize, pos;
+
+        if (code < 0x80)
+            outsize = 1;
+        else if (code < 0x800)
+            outsize = 2;
+        else {
+#if Py_UNICODE_SIZE == 2
+            if (code >> 10 == 0xd800 >> 10) { /* high surrogate */
+                if (inleft < 2) {
+                    if (!(flags & MBENC_FLUSH))
+                        return MBERR_TOOFEW;
+                }
+                else if ((*inbuf)[1] >> 10 == 0xdc00 >> 10) {
+                    /* low surrogate */
+                    code = 0x10000 + ((code - 0xd800) << 10) +
+                        ((ucs4_t)((*inbuf)[1]) - 0xdc00);
+                    consumed = 2;
+                }
+            }
+#endif
+            if (code < 0x10000)
+                outsize = 3;
+            else if (code < 0x200000)
+                outsize = 4;
+            else if (code < 0x4000000)
+                outsize = 5;
+            else
+                outsize = 6;
+        }
+
+        REQUIRE_OUTBUF(outsize)
+
+        /* Continuation bytes, last one first, six bits each. */
+        for (pos = outsize - 1; pos > 0; pos--) {
+            (*outbuf)[pos] = 0x80 | (code & 0x3f);
+            code >>= 6;
+        }
+        /* The remaining high bits share the first byte with the marker. */
+        (*outbuf)[0] = lead_mark[outsize] | code;
+
+        NEXT(consumed, outsize)
+    }
+
+    return 0;
+}
+
+/*
+ * UTF-8 decoder.  Handles sequences of one to six bytes.
+ *
+ * For a malformed sequence the length implied by the lead byte is returned
+ * (2 to 6); an invalid lead byte returns 1.  Lead bytes below 0xc2 that are
+ * not ASCII are rejected, as are overlong forms of the longer sequences.
+ * Only 0xfc and 0xfd introduce a six-byte sequence; 0xfe and 0xff are never
+ * valid lead bytes and both return 1.
+ */
+DECODER(utf_8)
+{
+    while (inleft > 0) {
+        unsigned char c = **inbuf;
+
+        REQUIRE_OUTBUF(1)
+
+        if (c < 0x80) {
+            (*outbuf)[0] = (unsigned char)c;
+            NEXT(1, 1)
+        }
+        else if (c < 0xc2) {
+            /* Continuation byte or overlong two-byte lead. */
+            return 1;
+        }
+        else if (c < 0xe0) {
+            unsigned char c2;
+
+            REQUIRE_INBUF(2)
+            c2 = (*inbuf)[1];
+            /* (x ^ 0x80) < 0x40 holds exactly for bytes 0x80-0xbf. */
+            if (!((c2 ^ 0x80) < 0x40))
+                return 2;
+            **outbuf = ((Py_UNICODE)(c & 0x1f) << 6) |
+                (Py_UNICODE)(c2 ^ 0x80);
+            NEXT(2, 1)
+        }
+        else if (c < 0xf0) {
+            unsigned char c2, c3;
+
+            REQUIRE_INBUF(3)
+            c2 = (*inbuf)[1]; c3 = (*inbuf)[2];
+            if (!((c2 ^ 0x80) < 0x40 &&
+                (c3 ^ 0x80) < 0x40 && (c >= 0xe1 || c2 >= 0xa0)))
+                return 3;
+            **outbuf = ((Py_UNICODE)(c & 0x0f) << 12)
+                | ((Py_UNICODE)(c2 ^ 0x80) << 6)
+                | (Py_UNICODE)(c3 ^ 0x80);
+            NEXT(3, 1)
+        }
+        else if (c < 0xf8) {
+            unsigned char c2, c3, c4;
+            ucs4_t code;
+
+            REQUIRE_INBUF(4)
+            c2 = (*inbuf)[1]; c3 = (*inbuf)[2];
+            c4 = (*inbuf)[3];
+            if (!((c2 ^ 0x80) < 0x40 &&
+                (c3 ^ 0x80) < 0x40 && (c4 ^ 0x80) < 0x40 &&
+                (c >= 0xf1 || c2 >= 0x90)))
+                return 4;
+            code = ((ucs4_t)(c & 0x07) << 18)
+                | ((ucs4_t)(c2 ^ 0x80) << 12)
+                | ((ucs4_t)(c3 ^ 0x80) << 6)
+                | (ucs4_t)(c4 ^ 0x80);
+            WRITEUCS4(code)
+            NEXT_IN(4)
+        }
+        else if (c < 0xfc) {
+            unsigned char c2, c3, c4, c5;
+            ucs4_t code;
+
+            REQUIRE_INBUF(5)
+            c2 = (*inbuf)[1]; c3 = (*inbuf)[2];
+            c4 = (*inbuf)[3]; c5 = (*inbuf)[4];
+            if (!((c2 ^ 0x80) < 0x40 &&
+                (c3 ^ 0x80) < 0x40 && (c4 ^ 0x80) < 0x40 &&
+                (c5 ^ 0x80) < 0x40 && (c >= 0xf9 || c2 >= 0x88)))
+                return 5;
+            code = ((ucs4_t)(c & 0x03) << 24)
+                | ((ucs4_t)(c2 ^ 0x80) << 18)
+                | ((ucs4_t)(c3 ^ 0x80) << 12)
+                | ((ucs4_t)(c4 ^ 0x80) << 6)
+                | (ucs4_t)(c5 ^ 0x80);
+            WRITEUCS4(code)
+            NEXT_IN(5)
+        }
+        else if (c < 0xfe) {
+            /* 0xfc or 0xfd: the only lead bytes of the form 1111110x. */
+            unsigned char c2, c3, c4, c5, c6;
+            ucs4_t code;
+
+            REQUIRE_INBUF(6)
+            c2 = (*inbuf)[1]; c3 = (*inbuf)[2];
+            c4 = (*inbuf)[3]; c5 = (*inbuf)[4];
+            c6 = (*inbuf)[5];
+            if (!((c2 ^ 0x80) < 0x40 &&
+                (c3 ^ 0x80) < 0x40 && (c4 ^ 0x80) < 0x40 &&
+                (c5 ^ 0x80) < 0x40 && (c6 ^ 0x80) < 0x40 &&
+                (c >= 0xfd || c2 >= 0x84)))
+                return 6;
+            code = ((ucs4_t)(c & 0x01) << 30)
+                | ((ucs4_t)(c2 ^ 0x80) << 24)
+                | ((ucs4_t)(c3 ^ 0x80) << 18)
+                | ((ucs4_t)(c4 ^ 0x80) << 12)
+                | ((ucs4_t)(c5 ^ 0x80) << 6)
+                | (ucs4_t)(c6 ^ 0x80);
+            WRITEUCS4(code)
+            NEXT_IN(6)
+        }
+        else
+            return 1;
+    }
+
+    return 0;
+}
+
+
+/* The mapping list is empty: nothing is placed between the two markers. */
+BEGIN_MAPPINGS_LIST
+END_MAPPINGS_LIST
+
+/* Codecs listed by this file: utf_7 is entered with CODEC_STATEFUL (it has
+ * the init/reset functions defined above), utf_8 with CODEC_STATELESS. */
+BEGIN_CODECS_LIST
+ CODEC_STATEFUL(utf_7)
+ CODEC_STATELESS(utf_8)
+END_CODECS_LIST
+
+I_AM_A_MODULE_FOR(unicode)