diff options
| author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2020-05-01 13:38:22 (GMT) |
|---|---|---|
| committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2020-05-01 13:38:22 (GMT) |
| commit | 9eaf82b745ac07bc55f7238813c449fc5a447cf8 (patch) | |
| tree | f421a15863ac0ae1148013bf95a401b8eeba0357 /generic/tclEncoding.c | |
| parent | ba28f4892362a62309d8809b4dc5099a888a9f91 (diff) | |
| parent | 62c00ac54a6f93ad1324d7e7aa5ef43623ca2415 (diff) | |
| download | tcl-9eaf82b745ac07bc55f7238813c449fc5a447cf8.zip tcl-9eaf82b745ac07bc55f7238813c449fc5a447cf8.tar.gz tcl-9eaf82b745ac07bc55f7238813c449fc5a447cf8.tar.bz2 | |
Fix first part of [ed29806baf]: Tcl_UtfToUniChar reads more than TCL_UTF_MAX bytes.
Tcl_UtfToUniChar() now never reads more than TCL_UTF_MAX bytes any more. The UtfToUtf encoder/decoder is adapted to do attitional checks (more tricky than in Tcl 8.7, since we want compatibility with earlier 8.6 releases).
Other callers of Tcl_UtfToUniChar() needs to be revised for the same problem. Most callers will need to change Tcl_UtfToUniChar() -> TclUtfToUCS4() and Tcl_UtfCharComplete() -> TclUCS4Complete(), but that's not done yet.
Diffstat (limited to 'generic/tclEncoding.c')
| -rw-r--r-- | generic/tclEncoding.c | 38 |
1 files changed, 26 insertions, 12 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 6ab0510..1584de0 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2331,6 +2331,7 @@ UtfToUtfProc( */ *dst++ = *src++; + *chPtr = 0; /* reset surrogate handling */ } else if (pureNullMode == 1 && UCHAR(*src) == 0xC0 && (src + 1 < srcEnd) && UCHAR(*(src+1)) == 0x80) { /* @@ -2338,35 +2339,48 @@ UtfToUtfProc( */ *dst++ = 0; + *chPtr = 0; /* reset surrogate handling */ src += 2; - } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { + } else if (!TclUCS4Complete(src, srcEnd - src)) { /* * Always check before using TclUtfToUniChar. Not doing can so * cause it run beyond the end of the buffer! If we happen such an * incomplete char its bytes are made to represent themselves. */ - *chPtr = (unsigned char) *src; + *chPtr = UCHAR(*src); src += 1; dst += Tcl_UniCharToUtf(*chPtr, dst); } else { - src += TclUtfToUniChar(src, chPtr); - if ((*chPtr | 0x7FF) == 0xDFFF) { + size_t len = TclUtfToUniChar(src, chPtr); + + src += len; + if ((*chPtr & ~0x7FF) == 0xD800) { + Tcl_UniChar low; /* A surrogate character is detected, handle especially */ - Tcl_UniChar low = *chPtr; - size_t len = (src <= srcEnd-3) ? Tcl_UtfToUniChar(src, &low) : 0; - if (((low | 0x3FF) != 0xDFFF) || (*chPtr & 0x400)) { - *dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF); - *dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF); - *dst++ = (char) ((*chPtr | 0x80) & 0xBF); - continue; +#if TCL_UTF_MAX <= 4 + if ((len < 3) && ((src[3 - len] & 0xC0) != 0x80)) { + /* It's invalid. See [ed29806ba] */ + *chPtr = UCHAR(src[-1]); + dst += Tcl_UniCharToUtf(*chPtr, dst); + continue; + } +#endif + low = *chPtr; + len = (src <= srcEnd-3) ? Tcl_UtfToUniChar(src, &low) : 0; + if (((low & ~0x3FF) != 0xDC00) || (*chPtr & 0x400)) { + *dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF); + *dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF); + *dst++ = (char) ((*chPtr | 0x80) & 0xBF); + *chPtr = 0; /* reset surrogate handling */ + continue; } else if ((TCL_UTF_MAX > 3) || (pureNullMode == 1)) { int full = (((*chPtr & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000; *dst++ = (char) (((full >> 18) | 0xF0) & 0xF7); *dst++ = (char) (((full >> 12) | 0x80) & 0xBF); *dst++ = (char) (((full >> 6) | 0x80) & 0xBF); *dst++ = (char) ((full | 0x80) & 0xBF); - *chPtr = 0; + *chPtr = 0; /* reset surrogate handling */ src += len; continue; } |
