summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2021-03-28 22:01:51 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2021-03-28 22:01:51 (GMT)
commit1806e5755f1240beb778c171d3b7a2797276adf2 (patch)
tree3492753f91aebc3dad4fc838c63e2bb951ccc87f
parentce82e0c22deb54edd228a3deba63ff5a3c7a62e5 (diff)
downloadtcl-1806e5755f1240beb778c171d3b7a2797276adf2.zip
tcl-1806e5755f1240beb778c171d3b7a2797276adf2.tar.gz
tcl-1806e5755f1240beb778c171d3b7a2797276adf2.tar.bz2
Make a start with CESU-8 encoder/decoder. Not finished yet
-rw-r--r--generic/tclEncoding.c25
-rw-r--r--tests/encoding.test2
2 files changed, 18 insertions, 9 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index b69f7fc..a158269 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -513,7 +513,8 @@ FillEncodingFileMap(void)
/* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */
#define TCL_ENCODING_MODIFIED 0x20 /* Converting NULL bytes to 0xC0 0x80 */
#define TCL_ENCODING_LE 0x80 /* Little-endian encoding, for ucs-2/utf-16 only */
-#define TCL_ENCODING_WTF 0x100 /* For wtf-8 encoding */
+#define TCL_ENCODING_WTF 0x100 /* For WTF-8 encoding, don't check for surrogates/noncharacters */
+#define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */
void
TclInitEncodingSubsystem(void)
@@ -555,11 +556,14 @@ TclInitEncodingSubsystem(void)
type.fromUtfProc = UtfToUtfProc;
type.freeProc = NULL;
type.nullSize = 1;
- type.clientData = NULL;
+ type.clientData = INT2PTR(TCL_ENCODING_UTF);
Tcl_CreateEncoding(&type);
- type.clientData = INT2PTR(TCL_ENCODING_WTF);
+ type.clientData = INT2PTR(TCL_ENCODING_UTF|TCL_ENCODING_WTF);
type.encodingName = "wtf-8";
Tcl_CreateEncoding(&type);
+ type.clientData = INT2PTR(0);
+ type.encodingName = "cesu-8";
+ Tcl_CreateEncoding(&type);
type.toUtfProc = Utf16ToUtfProc;
type.fromUtfProc = UtfToUcs2Proc;
@@ -1150,7 +1154,7 @@ Tcl_ExternalToUtfDString(
srcLen = encodingPtr->lengthProc(src);
}
- flags = TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED;
+ flags = TCL_ENCODING_START | TCL_ENCODING_END | TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
while (1) {
result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
@@ -1266,7 +1270,7 @@ Tcl_ExternalToUtf(
dstLen--;
}
- flags |= TCL_ENCODING_MODIFIED;
+ flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
do {
Tcl_EncodingState savedState = *statePtr;
@@ -2221,8 +2225,8 @@ UtfToUtfProc(
}
dstStart = dst;
- dstEnd = dst + dstLen - TCL_UTF_MAX;
flags |= PTR2INT(clientData);
+ dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6);
for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
@@ -2269,18 +2273,22 @@ UtfToUtfProc(
src += 1;
dst += Tcl_UniCharToUtf(ch, dst);
} else {
+ int low;
size_t len = TclUtfToUCS4(src, &ch);
if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR)) {
result = TCL_CONVERT_SYNTAX;
break;
}
src += len;
- if ((ch | 0x7FF) == 0xDFFF) {
+ if (!(flags & TCL_ENCODING_UTF)) {
+ // TODO : handle chars > U+FFFF
+ goto cesu8;
+ } else if ((ch | 0x7FF) == 0xDFFF) {
/*
* A surrogate character is detected, handle especially.
*/
- int low = ch;
+ low = ch;
len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
@@ -2293,6 +2301,7 @@ UtfToUtfProc(
ch = 0xFFFD;
}
}
+ cesu8:
*dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
*dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
*dst++ = (char) ((ch | 0x80) & 0xBF);
diff --git a/tests/encoding.test b/tests/encoding.test
index 43aecbb..0ce009f 100644
--- a/tests/encoding.test
+++ b/tests/encoding.test
@@ -808,7 +808,7 @@ test encoding-28.0 {all encodings load} -body {
llength $name
}
return $count
-} -result [expr {[info exists ::tcl_precision] ? 90 : 89}]
+} -result [expr {[info exists ::tcl_precision] ? 91 : 90}]
runtests