summaryrefslogtreecommitdiffstats
path: root/generic/tclEncoding.c
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2018-04-23 14:56:44 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2018-04-23 14:56:44 (GMT)
commit9acb063268f48f19e3c67a877f2c83e15fd1019d (patch)
tree2f95f0ec894aaa2fb2879a81d466cd5a50908ca3 /generic/tclEncoding.c
parent8e06fd796be19c40e0e82a7d9c9e54d34e975504 (diff)
downloadtcl-9acb063268f48f19e3c67a877f2c83e15fd1019d.zip
tcl-9acb063268f48f19e3c67a877f2c83e15fd1019d.tar.gz
tcl-9acb063268f48f19e3c67a877f2c83e15fd1019d.tar.bz2
Add some state to encodings, so we can do better surrogate handling for TCL_UTF_MAX >= 4. Backported from TIP #389.
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r--generic/tclEncoding.c79
1 files changed, 53 insertions, 26 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 2548b73..6b440e7 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2296,8 +2296,11 @@ UtfToUtfProc(
const char *srcStart, *srcEnd, *srcClose;
const char *dstStart, *dstEnd;
int result, numChars, charLimit = INT_MAX;
- Tcl_UniChar ch = 0;
+ Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr;
+ if (flags & TCL_ENCODING_START) {
+ *statePtr = 0;
+ }
result = TCL_OK;
srcStart = src;
@@ -2349,12 +2352,19 @@ UtfToUtfProc(
* incomplete char its bytes are made to represent themselves.
*/
- ch = (unsigned char) *src;
+ *chPtr = (unsigned char) *src;
src += 1;
- dst += Tcl_UniCharToUtf(ch, dst);
+ dst += Tcl_UniCharToUtf(*chPtr, dst);
} else {
- src += TclUtfToUniChar(src, &ch);
- dst += Tcl_UniCharToUtf(ch, dst);
+ int len = TclUtfToUniChar(src, chPtr);
+ src += len;
+ dst += Tcl_UniCharToUtf(*chPtr, dst);
+#if TCL_UTF_MAX == 4
+ if (!len) {
+ src += TclUtfToUniChar(src, chPtr);
+ dst += Tcl_UniCharToUtf(*chPtr, dst);
+ }
+#endif
}
}
@@ -2410,8 +2420,11 @@ UnicodeToUtfProc(
const char *srcStart, *srcEnd;
const char *dstEnd, *dstStart;
int result, numChars, charLimit = INT_MAX;
- Tcl_UniChar ch = 0;
+ Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr;
+ if (flags & TCL_ENCODING_START) {
+ *statePtr = 0;
+ }
if (flags & TCL_ENCODING_CHAR_LIMIT) {
charLimit = *dstCharsPtr;
}
@@ -2439,11 +2452,11 @@ UnicodeToUtfProc(
* Tcl_UniChar-size data.
*/
- ch = *(Tcl_UniChar *)src;
- if (ch && ch < 0x80) {
- *dst++ = (ch & 0xFF);
+ *chPtr = *(Tcl_UniChar *)src;
+ if (*chPtr && *chPtr < 0x80) {
+ *dst++ = (*chPtr & 0xFF);
} else {
- dst += Tcl_UniCharToUtf(ch, dst);
+ dst += Tcl_UniCharToUtf(*chPtr, dst);
}
src += sizeof(Tcl_UniChar);
}
@@ -2500,8 +2513,11 @@ UtfToUnicodeProc(
{
const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
int result, numChars;
- Tcl_UniChar ch = 0;
+ Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr;
+ if (flags & TCL_ENCODING_START) {
+ *statePtr = 0;
+ }
srcStart = src;
srcEnd = src + srcLen;
srcClose = srcEnd;
@@ -2527,7 +2543,7 @@ UtfToUnicodeProc(
result = TCL_CONVERT_NOSPACE;
break;
}
- src += TclUtfToUniChar(src, &ch);
+ src += TclUtfToUniChar(src, chPtr);
/*
* Need to handle this in a way that won't cause misalignment by
@@ -2536,23 +2552,23 @@ UtfToUnicodeProc(
#ifdef WORDS_BIGENDIAN
#if TCL_UTF_MAX > 4
- *dst++ = (ch >> 24);
- *dst++ = ((ch >> 16) & 0xFF);
- *dst++ = ((ch >> 8) & 0xFF);
- *dst++ = (ch & 0xFF);
+ *dst++ = (*chPtr >> 24);
+ *dst++ = ((*chPtr >> 16) & 0xFF);
+ *dst++ = ((*chPtr >> 8) & 0xFF);
+ *dst++ = (*chPtr & 0xFF);
#else
- *dst++ = (ch >> 8);
- *dst++ = (ch & 0xFF);
+ *dst++ = (*chPtr >> 8);
+ *dst++ = (*chPtr & 0xFF);
#endif
#else
#if TCL_UTF_MAX > 4
- *dst++ = (ch & 0xFF);
- *dst++ = ((ch >> 8) & 0xFF);
- *dst++ = ((ch >> 16) & 0xFF);
- *dst++ = (ch >> 24);
+ *dst++ = (*chPtr & 0xFF);
+ *dst++ = ((*chPtr >> 8) & 0xFF);
+ *dst++ = ((*chPtr >> 16) & 0xFF);
+ *dst++ = (*chPtr >> 24);
#else
- *dst++ = (ch & 0xFF);
- *dst++ = (ch >> 8);
+ *dst++ = (*chPtr & 0xFF);
+ *dst++ = (*chPtr >> 8);
#endif
#endif
}
@@ -2754,7 +2770,7 @@ TableFromUtfProc(
}
len = TclUtfToUniChar(src, &ch);
-#if TCL_UTF_MAX > 3
+#if TCL_UTF_MAX > 4
/*
* This prevents a crash condition. More evaluation is required for
* full support of int Tcl_UniChar. [Bug 1004065]
@@ -2763,6 +2779,10 @@ TableFromUtfProc(
if (ch & 0xffff0000) {
word = 0;
} else
+#elif TCL_UTF_MAX == 4
+ if (!len) {
+ word = 0;
+ } else
#endif
word = fromUnicode[(ch >> 8)][ch & 0xff];
@@ -2960,11 +2980,18 @@ Iso88591FromUtfProc(
* Check for illegal characters.
*/
- if (ch > 0xff) {
+ if (ch > 0xff
+#if TCL_UTF_MAX == 4
+ || !len
+#endif
+ ) {
if (flags & TCL_ENCODING_STOPONERROR) {
result = TCL_CONVERT_UNKNOWN;
break;
}
+#if TCL_UTF_MAX == 4
+ if (!len) len = 4;
+#endif
/*
* Plunge on, using '?' as a fallback character.