diff options
| author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2020-05-01 14:20:21 (GMT) |
|---|---|---|
| committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2020-05-01 14:20:21 (GMT) |
| commit | c0381fd712c458f3b4ee476e00d0857bec4cf5cc (patch) | |
| tree | 073e28ab81f4331a3b1b9dceea126f1e278bce76 /generic/tclEncoding.c | |
| parent | a3d900f322a935841622ac0c8d81a45c91a16e65 (diff) | |
| parent | 9eaf82b745ac07bc55f7238813c449fc5a447cf8 (diff) | |
| download | tcl-c0381fd712c458f3b4ee476e00d0857bec4cf5cc.zip tcl-c0381fd712c458f3b4ee476e00d0857bec4cf5cc.tar.gz tcl-c0381fd712c458f3b4ee476e00d0857bec4cf5cc.tar.bz2 | |
Fix first part of [ed29806baf]: Tcl_UtfToUniChar reads more than TCL_UTF_MAX bytes.
Tcl_UtfToUniChar() now never reads more than TCL_UTF_MAX bytes any more.
Since the UtfToUtf encoder/decoder now uses TclUtfToUCS4() it doesn't join 2 surrogates as 2 x 3-byte sequences any more. Actually, it shouldn't, because such sequences are invalid UTF-8.
Therefore, added the ucs2 constraint to testcase encoding-15.4. Let's see how TIP #573 goes, this TIP should make this change official.
Other callers of Tcl_UtfToUniChar() needs to be revised for the same problem. Most callers will need to change Tcl_UtfToUniChar() -> TclUtfToUCS4() and Tcl_UtfCharComplete() -> TclUCS4Complete(), but that's not done yet.
Diffstat (limited to 'generic/tclEncoding.c')
| -rw-r--r-- | generic/tclEncoding.c | 20 |
1 files changed, 11 insertions, 9 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 4789b7f..444f99e 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2300,7 +2300,7 @@ UtfToUtfProc( const char *srcStart, *srcEnd, *srcClose; const char *dstStart, *dstEnd; int result, numChars, charLimit = INT_MAX; - Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr; + int *chPtr = (int *) statePtr; if (flags & TCL_ENCODING_START) { *statePtr = 0; @@ -2321,7 +2321,7 @@ UtfToUtfProc( dstEnd = dst + dstLen - TCL_UTF_MAX; for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { - if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { + if ((src > srcClose) && (!TclUCS4Complete(src, srcEnd - src))) { /* * If there is more string to follow, this will ensure that the * last UTF-8 character in the source buffer hasn't been cut off. @@ -2341,6 +2341,7 @@ UtfToUtfProc( */ *dst++ = *src++; + *chPtr = 0; /* reset surrogate handling */ } else if (pureNullMode == 1 && UCHAR(*src) == 0xC0 && (src + 1 < srcEnd) && UCHAR(*(src+1)) == 0x80) { /* @@ -2348,24 +2349,25 @@ UtfToUtfProc( */ *dst++ = 0; + *chPtr = 0; /* reset surrogate handling */ src += 2; - } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { + } else if (!TclUCS4Complete(src, srcEnd - src)) { /* - * Always check before using TclUtfToUniChar. Not doing can so + * Always check before using TclUtfToUCS4. Not doing can so * cause it run beyond the end of the buffer! If we happen such an * incomplete char its bytes are made to represent themselves. */ - *chPtr = (unsigned char) *src; + *chPtr = UCHAR(*src); src += 1; dst += Tcl_UniCharToUtf(*chPtr, dst); } else { - src += TclUtfToUniChar(src, chPtr); + src += TclUtfToUCS4(src, chPtr); if ((*chPtr | 0x7FF) == 0xDFFF) { /* A surrogate character is detected, handle especially */ - Tcl_UniChar low = *chPtr; - size_t len = (src <= srcEnd-3) ? Tcl_UtfToUniChar(src, &low) : 0; - if (((low | 0x3FF) != 0xDFFF) || (*chPtr & 0x400)) { + int low = *chPtr; + size_t len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; + if (((low & ~0x3FF) != 0xC00) || (*chPtr & 0x400)) { *dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF); *dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF); *dst++ = (char) ((*chPtr | 0x80) & 0xBF); |
