From 1806e5755f1240beb778c171d3b7a2797276adf2 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sun, 28 Mar 2021 22:01:51 +0000 Subject: Make a start with CESU-8 encoder/decoder. Not finished yet --- generic/tclEncoding.c | 25 +++++++++++++++++-------- tests/encoding.test | 2 +- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index b69f7fc..a158269 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -513,7 +513,8 @@ FillEncodingFileMap(void) /* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */ #define TCL_ENCODING_MODIFIED 0x20 /* Converting NULL bytes to 0xC0 0x80 */ #define TCL_ENCODING_LE 0x80 /* Little-endian encoding, for ucs-2/utf-16 only */ -#define TCL_ENCODING_WTF 0x100 /* For wtf-8 encoding */ +#define TCL_ENCODING_WTF 0x100 /* For WTF-8 encoding, don't check for surrogates/noncharacters */ +#define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ void TclInitEncodingSubsystem(void) @@ -555,11 +556,14 @@ TclInitEncodingSubsystem(void) type.fromUtfProc = UtfToUtfProc; type.freeProc = NULL; type.nullSize = 1; - type.clientData = NULL; + type.clientData = INT2PTR(TCL_ENCODING_UTF); Tcl_CreateEncoding(&type); - type.clientData = INT2PTR(TCL_ENCODING_WTF); + type.clientData = INT2PTR(TCL_ENCODING_UTF|TCL_ENCODING_WTF); type.encodingName = "wtf-8"; Tcl_CreateEncoding(&type); + type.clientData = INT2PTR(0); + type.encodingName = "cesu-8"; + Tcl_CreateEncoding(&type); type.toUtfProc = Utf16ToUtfProc; type.fromUtfProc = UtfToUcs2Proc; @@ -1150,7 +1154,7 @@ Tcl_ExternalToUtfDString( srcLen = encodingPtr->lengthProc(src); } - flags = TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED; + flags = TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; while (1) { result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen, @@ -1266,7 +1270,7 @@ Tcl_ExternalToUtf( dstLen--; } - flags |= TCL_ENCODING_MODIFIED; + flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; do { Tcl_EncodingState savedState = *statePtr; @@ -2221,8 +2225,8 @@ UtfToUtfProc( } dstStart = dst; - dstEnd = dst + dstLen - TCL_UTF_MAX; flags |= PTR2INT(clientData); + dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6); for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { @@ -2269,18 +2273,22 @@ UtfToUtfProc( src += 1; dst += Tcl_UniCharToUtf(ch, dst); } else { + int low; size_t len = TclUtfToUCS4(src, &ch); if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR)) { result = TCL_CONVERT_SYNTAX; break; } src += len; - if ((ch | 0x7FF) == 0xDFFF) { + if (!(flags & TCL_ENCODING_UTF)) { + // TODO : handle chars > U+FFFF + goto cesu8; + } else if ((ch | 0x7FF) == 0xDFFF) { /* * A surrogate character is detected, handle especially. */ - int low = ch; + low = ch; len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { @@ -2293,6 +2301,7 @@ UtfToUtfProc( ch = 0xFFFD; } } + cesu8: *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); *dst++ = (char) ((ch | 0x80) & 0xBF); diff --git a/tests/encoding.test b/tests/encoding.test index 43aecbb..0ce009f 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -808,7 +808,7 @@ test encoding-28.0 {all encodings load} -body { llength $name } return $count -} -result [expr {[info exists ::tcl_precision] ? 90 : 89}] +} -result [expr {[info exists ::tcl_precision] ? 91 : 90}] runtests -- cgit v0.12