diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2021-03-21 09:27:07 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2021-03-21 09:27:07 (GMT) |
commit | dd9f3c1246178a54af32d25015227a5a1a7dced8 (patch) | |
tree | b40af99106caa9a73b68d26815b1b0e219ce4d96 /generic | |
parent | fff5060d945d7ee8f74e84dea9b0f703e2bae424 (diff) | |
download | tcl-dd9f3c1246178a54af32d25015227a5a1a7dced8.zip tcl-dd9f3c1246178a54af32d25015227a5a1a7dced8.tar.gz tcl-dd9f3c1246178a54af32d25015227a5a1a7dced8.tar.bz2 |
Implement TCL_ENCODING_STOPONERROR flag for UtfToUtf encoder/decoder. Backported from 8.7
Diffstat (limited to 'generic')
-rw-r--r-- | generic/tclEncoding.c | 32 |
1 files changed, 25 insertions, 7 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index f1529e1..012c954 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2345,15 +2345,24 @@ UtfToUtfProc( /* * Always check before using TclUtfToUniChar. Not doing can so * cause it run beyond the end of the buffer! If we happen such an - * incomplete char its bytes are made to represent themselves. + * incomplete char its bytes are made to represent themselves + * unless the user has explicitly asked to be told. */ + if (flags & TCL_ENCODING_STOPONERROR) { + result = TCL_CONVERT_MULTIBYTE; + break; + } *chPtr = UCHAR(*src); src += 1; dst += Tcl_UniCharToUtf(*chPtr, dst); } else { size_t len = TclUtfToUniChar(src, chPtr); - + if ((len < 2) && (flags & TCL_ENCODING_STOPONERROR) && (*chPtr != 0) + && ((*chPtr & ~0x7FF) != 0xD800)) { + result = TCL_CONVERT_SYNTAX; + break; + } src += len; if ((*chPtr & ~0x7FF) == 0xD800) { Tcl_UniChar low; @@ -2453,8 +2462,13 @@ UnicodeToUtfProc( result = TCL_CONVERT_MULTIBYTE; srcLen--; } - /* If last code point is a high surrogate, we cannot handle that yet */ - if ((srcLen >= 2) && ((src[srcLen - (clientData?1:2)] & 0xFC) == 0xD8)) { + + /* + * If last code point is a high surrogate, we cannot handle that yet. + */ + + if ((srcLen >= 2) && + ((src[srcLen - (clientData?1:2)] & 0xFC) == 0xD8)) { result = TCL_CONVERT_MULTIBYTE; srcLen-= 2; } @@ -2476,10 +2490,12 @@ UnicodeToUtfProc( } else { ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF); } + /* * Special case for 1-byte utf chars for speed. Make sure we work with * unsigned short-size data. */ + if (ch && ch < 0x80) { *dst++ = (ch & 0xFF); } else { @@ -3015,7 +3031,9 @@ Iso88591FromUtfProc( break; } #if TCL_UTF_MAX == 4 - if ((ch >= 0xD800) && (len < 3)) len = 4; + if ((ch >= 0xD800) && (len < 3)) { + len = 4; + } #endif /* * Plunge on, using '?' as a fallback character. @@ -3060,7 +3078,7 @@ TableFreeProc( ClientData clientData) /* TableEncodingData that specifies * encoding. */ { - TableEncodingData *dataPtr = (TableEncodingData *)clientData; + TableEncodingData *dataPtr = (TableEncodingData *) clientData; /* * Make sure we aren't freeing twice on shutdown. [Bug 219314] @@ -3118,7 +3136,7 @@ EscapeToUtfProc( * correspond to the bytes stored in the * output buffer. */ { - EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData; + EscapeEncodingData *dataPtr = (EscapeEncodingData *) clientData; const char *prefixBytes, *tablePrefixBytes, *srcStart, *srcEnd; const unsigned short *const *tableToUnicode; const Encoding *encodingPtr; |