diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2021-04-30 08:49:18 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2021-04-30 08:49:18 (GMT) |
commit | 07603e3b381387670e03489d397205372589c336 (patch) | |
tree | 65eee861d510dc0d9cb7916edd9792ec363c944f /generic/tclEncoding.c | |
parent | 092b62fc6bc9cb55ef045a8532fe211acf9f8ec1 (diff) | |
parent | 471313fea05798b7d188c6f69266b319236abef1 (diff) | |
download | tcl-07603e3b381387670e03489d397205372589c336.zip tcl-07603e3b381387670e03489d397205372589c336.tar.gz tcl-07603e3b381387670e03489d397205372589c336.tar.bz2 |
Merge 8.7. Remove "string bytelength" completely. Also fix some leftover TIP #595 test cases that were being skipped.
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r-- | generic/tclEncoding.c | 63 |
1 files changed, 52 insertions, 11 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 1c03fec..2201b3b 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -510,11 +510,12 @@ FillEncodingFileMap(void) *--------------------------------------------------------------------------- */ -/* This flags must not conflict with other TCL_ENCODING_* flags in tcl.h */ +/* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */ +/* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and + * TCL_ENCODING_LE is only used for utf-16/ucs-2. re-use the same value */ #define TCL_ENCODING_MODIFIED 0x20 /* Converting NULL bytes to 0xC0 0x80 */ -/* Since TCL_ENCODING_MODIFIED is only used for utf-8 and - * TCL_ENCODING_LE is only used for utf-16/ucs-2, re-use the same value */ #define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */ +#define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ void TclInitEncodingSubsystem(void) @@ -556,7 +557,10 @@ TclInitEncodingSubsystem(void) type.fromUtfProc = UtfToUtfProc; type.freeProc = NULL; type.nullSize = 1; - type.clientData = NULL; + type.clientData = INT2PTR(TCL_ENCODING_UTF); + Tcl_CreateEncoding(&type); + type.clientData = INT2PTR(0); + type.encodingName = "cesu-8"; Tcl_CreateEncoding(&type); type.toUtfProc = Utf16ToUtfProc; @@ -1078,7 +1082,7 @@ Tcl_ExternalToUtfDString( flags = TCL_ENCODING_START | TCL_ENCODING_END; if (encodingPtr->toUtfProc == UtfToUtfProc) { - flags |= TCL_ENCODING_MODIFIED; + flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; } while (1) { @@ -1195,7 +1199,7 @@ Tcl_ExternalToUtf( dstLen--; } if (encodingPtr->toUtfProc == UtfToUtfProc) { - flags |= TCL_ENCODING_MODIFIED; + flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; } do { Tcl_EncodingState savedState = *statePtr; @@ -1275,6 +1279,7 @@ Tcl_UtfToExternalDString( &srcRead, &dstWrote, &dstChars); soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); + src += srcRead; if (result != TCL_CONVERT_NOSPACE) { if 
(encodingPtr->nullSize == 2) { Tcl_DStringSetLength(dstPtr, soFar + 1); @@ -1284,7 +1289,6 @@ Tcl_UtfToExternalDString( } flags &= ~TCL_ENCODING_START; - src += srcRead; srcLen -= srcRead; if (Tcl_DStringLength(dstPtr) == 0) { Tcl_DStringSetLength(dstPtr, dstLen); @@ -2153,7 +2157,7 @@ UtfToUtfProc( dstStart = dst; flags |= PTR2INT(clientData); - dstEnd = dst + dstLen - TCL_UTF_MAX; + dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6); for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { @@ -2206,6 +2210,7 @@ UtfToUtfProc( dst += Tcl_UniCharToUtf(ch, dst); } else { int low; + const char *saveSrc = src; size_t len = TclUtfToUCS4(src, &ch); if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR) && (flags & TCL_ENCODING_MODIFIED)) { @@ -2213,7 +2218,17 @@ UtfToUtfProc( break; } src += len; - if ((ch | 0x7FF) == 0xDFFF) { + if (!(flags & TCL_ENCODING_UTF)) { + if (ch > 0xFFFF) { + /* CESU-8 6-byte sequence for chars > U+FFFF */ + ch -= 0x10000; + *dst++ = 0xED; + *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0); + *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80); + ch = (ch & 0x0CFF) | 0xDC00; + } + goto cesu8; + } else if ((ch | 0x7FF) == 0xDFFF) { /* * A surrogate character is detected, handle especially. */ @@ -2222,6 +2237,15 @@ UtfToUtfProc( len = (src <= srcEnd-3) ? 
TclUtfToUCS4(src, &low) : 0; if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { + if (flags & TCL_ENCODING_STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } + if (!(flags & TCL_ENCODING_MODIFIED)) { + ch = 0xFFFD; + } + cesu8: *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); *dst++ = (char) ((ch | 0x80) & 0xBF); @@ -2230,6 +2254,15 @@ UtfToUtfProc( src += len; dst += Tcl_UniCharToUtf(ch, dst); ch = low; + } else if (!Tcl_UniCharIsUnicode(ch)) { + if (flags & TCL_ENCODING_STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } + if (!(flags & TCL_ENCODING_MODIFIED)) { + ch = 0xFFFD; + } } dst += Tcl_UniCharToUtf(ch, dst); } @@ -2388,7 +2421,7 @@ UtfToUtf16Proc( { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; - int ch; + int ch, len; srcStart = src; srcEnd = src + srcLen; @@ -2416,7 +2449,15 @@ UtfToUtf16Proc( result = TCL_CONVERT_NOSPACE; break; } - src += TclUtfToUCS4(src, &ch); + len = TclUtfToUCS4(src, &ch); + if (!Tcl_UniCharIsUnicode(ch)) { + if (flags & TCL_ENCODING_STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + break; + } + ch = 0xFFFD; + } + src += len; if (flags & TCL_ENCODING_LE) { if (ch <= 0xFFFF) { *dst++ = (ch & 0xFF); |