diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2018-04-23 14:56:44 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2018-04-23 14:56:44 (GMT) |
commit | 9acb063268f48f19e3c67a877f2c83e15fd1019d (patch) | |
tree | 2f95f0ec894aaa2fb2879a81d466cd5a50908ca3 /generic/tclEncoding.c | |
parent | 8e06fd796be19c40e0e82a7d9c9e54d34e975504 (diff) | |
download | tcl-9acb063268f48f19e3c67a877f2c83e15fd1019d.zip tcl-9acb063268f48f19e3c67a877f2c83e15fd1019d.tar.gz tcl-9acb063268f48f19e3c67a877f2c83e15fd1019d.tar.bz2 |
Add some state to encodings, so we can do better surrogate handling for TCL_UTF_MAX >= 4. Backported from TIP #389.
Diffstat (limited to 'generic/tclEncoding.c')
-rw-r--r-- | generic/tclEncoding.c | 79 |
1 file changed, 53 insertions, 26 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 2548b73..6b440e7 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2296,8 +2296,11 @@ UtfToUtfProc( const char *srcStart, *srcEnd, *srcClose; const char *dstStart, *dstEnd; int result, numChars, charLimit = INT_MAX; - Tcl_UniChar ch = 0; + Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr; + if (flags & TCL_ENCODING_START) { + *statePtr = 0; + } result = TCL_OK; srcStart = src; @@ -2349,12 +2352,19 @@ UtfToUtfProc( * incomplete char its bytes are made to represent themselves. */ - ch = (unsigned char) *src; + *chPtr = (unsigned char) *src; src += 1; - dst += Tcl_UniCharToUtf(ch, dst); + dst += Tcl_UniCharToUtf(*chPtr, dst); } else { - src += TclUtfToUniChar(src, &ch); - dst += Tcl_UniCharToUtf(ch, dst); + int len = TclUtfToUniChar(src, chPtr); + src += len; + dst += Tcl_UniCharToUtf(*chPtr, dst); +#if TCL_UTF_MAX == 4 + if (!len) { + src += TclUtfToUniChar(src, chPtr); + dst += Tcl_UniCharToUtf(*chPtr, dst); + } +#endif } } @@ -2410,8 +2420,11 @@ UnicodeToUtfProc( const char *srcStart, *srcEnd; const char *dstEnd, *dstStart; int result, numChars, charLimit = INT_MAX; - Tcl_UniChar ch = 0; + Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr; + if (flags & TCL_ENCODING_START) { + *statePtr = 0; + } if (flags & TCL_ENCODING_CHAR_LIMIT) { charLimit = *dstCharsPtr; } @@ -2439,11 +2452,11 @@ UnicodeToUtfProc( * Tcl_UniChar-size data. 
*/ - ch = *(Tcl_UniChar *)src; - if (ch && ch < 0x80) { - *dst++ = (ch & 0xFF); + *chPtr = *(Tcl_UniChar *)src; + if (*chPtr && *chPtr < 0x80) { + *dst++ = (*chPtr & 0xFF); } else { - dst += Tcl_UniCharToUtf(ch, dst); + dst += Tcl_UniCharToUtf(*chPtr, dst); } src += sizeof(Tcl_UniChar); } @@ -2500,8 +2513,11 @@ UtfToUnicodeProc( { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; - Tcl_UniChar ch = 0; + Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr; + if (flags & TCL_ENCODING_START) { + *statePtr = 0; + } srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; @@ -2527,7 +2543,7 @@ UtfToUnicodeProc( result = TCL_CONVERT_NOSPACE; break; } - src += TclUtfToUniChar(src, &ch); + src += TclUtfToUniChar(src, chPtr); /* * Need to handle this in a way that won't cause misalignment by @@ -2536,23 +2552,23 @@ UtfToUnicodeProc( #ifdef WORDS_BIGENDIAN #if TCL_UTF_MAX > 4 - *dst++ = (ch >> 24); - *dst++ = ((ch >> 16) & 0xFF); - *dst++ = ((ch >> 8) & 0xFF); - *dst++ = (ch & 0xFF); + *dst++ = (*chPtr >> 24); + *dst++ = ((*chPtr >> 16) & 0xFF); + *dst++ = ((*chPtr >> 8) & 0xFF); + *dst++ = (*chPtr & 0xFF); #else - *dst++ = (ch >> 8); - *dst++ = (ch & 0xFF); + *dst++ = (*chPtr >> 8); + *dst++ = (*chPtr & 0xFF); #endif #else #if TCL_UTF_MAX > 4 - *dst++ = (ch & 0xFF); - *dst++ = ((ch >> 8) & 0xFF); - *dst++ = ((ch >> 16) & 0xFF); - *dst++ = (ch >> 24); + *dst++ = (*chPtr & 0xFF); + *dst++ = ((*chPtr >> 8) & 0xFF); + *dst++ = ((*chPtr >> 16) & 0xFF); + *dst++ = (*chPtr >> 24); #else - *dst++ = (ch & 0xFF); - *dst++ = (ch >> 8); + *dst++ = (*chPtr & 0xFF); + *dst++ = (*chPtr >> 8); #endif #endif } @@ -2754,7 +2770,7 @@ TableFromUtfProc( } len = TclUtfToUniChar(src, &ch); -#if TCL_UTF_MAX > 3 +#if TCL_UTF_MAX > 4 /* * This prevents a crash condition. More evaluation is required for * full support of int Tcl_UniChar. 
[Bug 1004065] @@ -2763,6 +2779,10 @@ TableFromUtfProc( if (ch & 0xffff0000) { word = 0; } else +#elif TCL_UTF_MAX == 4 + if (!len) { + word = 0; + } else #endif word = fromUnicode[(ch >> 8)][ch & 0xff]; @@ -2960,11 +2980,18 @@ Iso88591FromUtfProc( * Check for illegal characters. */ - if (ch > 0xff) { + if (ch > 0xff +#if TCL_UTF_MAX == 4 + || !len +#endif + ) { if (flags & TCL_ENCODING_STOPONERROR) { result = TCL_CONVERT_UNKNOWN; break; } +#if TCL_UTF_MAX == 4 + if (!len) len = 4; +#endif /* * Plunge on, using '?' as a fallback character. |