summaryrefslogtreecommitdiffstats
path: root/generic/tclEncoding.c
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2021-04-30 08:49:18 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2021-04-30 08:49:18 (GMT)
commit07603e3b381387670e03489d397205372589c336 (patch)
tree65eee861d510dc0d9cb7916edd9792ec363c944f /generic/tclEncoding.c
parent092b62fc6bc9cb55ef045a8532fe211acf9f8ec1 (diff)
parent471313fea05798b7d188c6f69266b319236abef1 (diff)
downloadtcl-07603e3b381387670e03489d397205372589c336.zip
tcl-07603e3b381387670e03489d397205372589c336.tar.gz
tcl-07603e3b381387670e03489d397205372589c336.tar.bz2
Merge 8.7. Remove "string bytelength" completely. Also fix some TIP #595 leftover testcases, which were skipped
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r--generic/tclEncoding.c63
1 files changed, 52 insertions, 11 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 1c03fec..2201b3b 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -510,11 +510,12 @@ FillEncodingFileMap(void)
*---------------------------------------------------------------------------
*/
-/* This flags must not conflict with other TCL_ENCODING_* flags in tcl.h */
+/* These flags must not conflict with other TCL_ENCODING_* flags in tcl.h */
+/* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and
* TCL_ENCODING_LE is only used for utf-16/ucs-2, re-use the same value */
#define TCL_ENCODING_MODIFIED 0x20 /* Converting NULL bytes to 0xC0 0x80 */
-/* Since TCL_ENCODING_MODIFIED is only used for utf-8 and
- * TCL_ENCODING_LE is only used for utf-16/ucs-2, re-use the same value */
#define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */
+#define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */
void
TclInitEncodingSubsystem(void)
@@ -556,7 +557,10 @@ TclInitEncodingSubsystem(void)
type.fromUtfProc = UtfToUtfProc;
type.freeProc = NULL;
type.nullSize = 1;
- type.clientData = NULL;
+ type.clientData = INT2PTR(TCL_ENCODING_UTF);
+ Tcl_CreateEncoding(&type);
+ type.clientData = INT2PTR(0);
+ type.encodingName = "cesu-8";
Tcl_CreateEncoding(&type);
type.toUtfProc = Utf16ToUtfProc;
@@ -1078,7 +1082,7 @@ Tcl_ExternalToUtfDString(
flags = TCL_ENCODING_START | TCL_ENCODING_END;
if (encodingPtr->toUtfProc == UtfToUtfProc) {
- flags |= TCL_ENCODING_MODIFIED;
+ flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
}
while (1) {
@@ -1195,7 +1199,7 @@ Tcl_ExternalToUtf(
dstLen--;
}
if (encodingPtr->toUtfProc == UtfToUtfProc) {
- flags |= TCL_ENCODING_MODIFIED;
+ flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
}
do {
Tcl_EncodingState savedState = *statePtr;
@@ -1275,6 +1279,7 @@ Tcl_UtfToExternalDString(
&srcRead, &dstWrote, &dstChars);
soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
+ src += srcRead;
if (result != TCL_CONVERT_NOSPACE) {
if (encodingPtr->nullSize == 2) {
Tcl_DStringSetLength(dstPtr, soFar + 1);
@@ -1284,7 +1289,6 @@ Tcl_UtfToExternalDString(
}
flags &= ~TCL_ENCODING_START;
- src += srcRead;
srcLen -= srcRead;
if (Tcl_DStringLength(dstPtr) == 0) {
Tcl_DStringSetLength(dstPtr, dstLen);
@@ -2153,7 +2157,7 @@ UtfToUtfProc(
dstStart = dst;
flags |= PTR2INT(clientData);
- dstEnd = dst + dstLen - TCL_UTF_MAX;
+ dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6);
for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
@@ -2206,6 +2210,7 @@ UtfToUtfProc(
dst += Tcl_UniCharToUtf(ch, dst);
} else {
int low;
+ const char *saveSrc = src;
size_t len = TclUtfToUCS4(src, &ch);
if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR)
&& (flags & TCL_ENCODING_MODIFIED)) {
@@ -2213,7 +2218,17 @@ UtfToUtfProc(
break;
}
src += len;
- if ((ch | 0x7FF) == 0xDFFF) {
+ if (!(flags & TCL_ENCODING_UTF)) {
+ if (ch > 0xFFFF) {
+ /* CESU-8 6-byte sequence for chars > U+FFFF */
+ ch -= 0x10000;
+ *dst++ = 0xED;
+ *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0);
+ *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80);
+ ch = (ch & 0x0CFF) | 0xDC00;
+ }
+ goto cesu8;
+ } else if ((ch | 0x7FF) == 0xDFFF) {
/*
* A surrogate character is detected, handle especially.
*/
@@ -2222,6 +2237,15 @@ UtfToUtfProc(
len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
+ if (flags & TCL_ENCODING_STOPONERROR) {
+ result = TCL_CONVERT_UNKNOWN;
+ src = saveSrc;
+ break;
+ }
+ if (!(flags & TCL_ENCODING_MODIFIED)) {
+ ch = 0xFFFD;
+ }
+ cesu8:
*dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
*dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
*dst++ = (char) ((ch | 0x80) & 0xBF);
@@ -2230,6 +2254,15 @@ UtfToUtfProc(
src += len;
dst += Tcl_UniCharToUtf(ch, dst);
ch = low;
+ } else if (!Tcl_UniCharIsUnicode(ch)) {
+ if (flags & TCL_ENCODING_STOPONERROR) {
+ result = TCL_CONVERT_UNKNOWN;
+ src = saveSrc;
+ break;
+ }
+ if (!(flags & TCL_ENCODING_MODIFIED)) {
+ ch = 0xFFFD;
+ }
}
dst += Tcl_UniCharToUtf(ch, dst);
}
@@ -2388,7 +2421,7 @@ UtfToUtf16Proc(
{
const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
int result, numChars;
- int ch;
+ int ch, len;
srcStart = src;
srcEnd = src + srcLen;
@@ -2416,7 +2449,15 @@ UtfToUtf16Proc(
result = TCL_CONVERT_NOSPACE;
break;
}
- src += TclUtfToUCS4(src, &ch);
+ len = TclUtfToUCS4(src, &ch);
+ if (!Tcl_UniCharIsUnicode(ch)) {
+ if (flags & TCL_ENCODING_STOPONERROR) {
+ result = TCL_CONVERT_UNKNOWN;
+ break;
+ }
+ ch = 0xFFFD;
+ }
+ src += len;
if (flags & TCL_ENCODING_LE) {
if (ch <= 0xFFFF) {
*dst++ = (ch & 0xFF);