diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2020-05-01 13:38:22 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2020-05-01 13:38:22 (GMT) |
commit | 9eaf82b745ac07bc55f7238813c449fc5a447cf8 (patch) | |
tree | f421a15863ac0ae1148013bf95a401b8eeba0357 /generic | |
parent | ba28f4892362a62309d8809b4dc5099a888a9f91 (diff) | |
parent | 62c00ac54a6f93ad1324d7e7aa5ef43623ca2415 (diff) | |
download | tcl-9eaf82b745ac07bc55f7238813c449fc5a447cf8.zip tcl-9eaf82b745ac07bc55f7238813c449fc5a447cf8.tar.gz tcl-9eaf82b745ac07bc55f7238813c449fc5a447cf8.tar.bz2 |
Fix first part of [ed29806baf]: Tcl_UtfToUniChar reads more than TCL_UTF_MAX bytes.
Tcl_UtfToUniChar() now never reads more than TCL_UTF_MAX bytes. The UtfToUtf encoder/decoder is adapted to do additional checks (trickier than in Tcl 8.7, since we want compatibility with earlier 8.6 releases).
Other callers of Tcl_UtfToUniChar() need to be revised for the same problem. Most callers will need to change Tcl_UtfToUniChar() -> TclUtfToUCS4() and Tcl_UtfCharComplete() -> TclUCS4Complete(), but that's not done yet.
Diffstat (limited to 'generic')
-rw-r--r-- | generic/tclEncoding.c | 38 | ||||
-rw-r--r-- | generic/tclInt.h | 10 | ||||
-rw-r--r-- | generic/tclUtf.c | 20 |
3 files changed, 42 insertions, 26 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 6ab0510..1584de0 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2331,6 +2331,7 @@ UtfToUtfProc( */ *dst++ = *src++; + *chPtr = 0; /* reset surrogate handling */ } else if (pureNullMode == 1 && UCHAR(*src) == 0xC0 && (src + 1 < srcEnd) && UCHAR(*(src+1)) == 0x80) { /* @@ -2338,35 +2339,48 @@ UtfToUtfProc( */ *dst++ = 0; + *chPtr = 0; /* reset surrogate handling */ src += 2; - } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { + } else if (!TclUCS4Complete(src, srcEnd - src)) { /* * Always check before using TclUtfToUniChar. Not doing can so * cause it run beyond the end of the buffer! If we happen such an * incomplete char its bytes are made to represent themselves. */ - *chPtr = (unsigned char) *src; + *chPtr = UCHAR(*src); src += 1; dst += Tcl_UniCharToUtf(*chPtr, dst); } else { - src += TclUtfToUniChar(src, chPtr); - if ((*chPtr | 0x7FF) == 0xDFFF) { + size_t len = TclUtfToUniChar(src, chPtr); + + src += len; + if ((*chPtr & ~0x7FF) == 0xD800) { + Tcl_UniChar low; /* A surrogate character is detected, handle especially */ - Tcl_UniChar low = *chPtr; - size_t len = (src <= srcEnd-3) ? Tcl_UtfToUniChar(src, &low) : 0; - if (((low | 0x3FF) != 0xDFFF) || (*chPtr & 0x400)) { - *dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF); - *dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF); - *dst++ = (char) ((*chPtr | 0x80) & 0xBF); - continue; +#if TCL_UTF_MAX <= 4 + if ((len < 3) && ((src[3 - len] & 0xC0) != 0x80)) { + /* It's invalid. See [ed29806ba] */ + *chPtr = UCHAR(src[-1]); + dst += Tcl_UniCharToUtf(*chPtr, dst); + continue; + } +#endif + low = *chPtr; + len = (src <= srcEnd-3) ? 
Tcl_UtfToUniChar(src, &low) : 0; + if (((low & ~0x3FF) != 0xDC00) || (*chPtr & 0x400)) { + *dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF); + *dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF); + *dst++ = (char) ((*chPtr | 0x80) & 0xBF); + *chPtr = 0; /* reset surrogate handling */ + continue; } else if ((TCL_UTF_MAX > 3) || (pureNullMode == 1)) { int full = (((*chPtr & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000; *dst++ = (char) (((full >> 18) | 0xF0) & 0xF7); *dst++ = (char) (((full >> 12) | 0x80) & 0xBF); *dst++ = (char) (((full >> 6) | 0x80) & 0xBF); *dst++ = (char) ((full | 0x80) & 0xBF); - *chPtr = 0; + *chPtr = 0; /* reset surrogate handling */ src += len; continue; } diff --git a/generic/tclInt.h b/generic/tclInt.h index 5df9aac..593d878 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3184,6 +3184,8 @@ MODULE_SCOPE int TclTrimRight(const char *bytes, int numBytes, const char *trim, int numTrim); MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct); MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr); +# define TclUCS4Complete(src, length) (((unsigned)(UCHAR(*(src)) - 0xF0) < 5) \ + ? ((length) >= 4) : Tcl_UtfCharComplete((src), (length))) MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData); MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr); MODULE_SCOPE int TclpDlopen(Tcl_Interp *interp, Tcl_Obj *pathPtr, @@ -4436,8 +4438,8 @@ MODULE_SCOPE void TclDbInitNewObj(Tcl_Obj *objPtr, const char *file, */ #define TclUtfToUniChar(str, chPtr) \ - ((((unsigned char) *(str)) < 0x80) ? \ - ((*(chPtr) = (unsigned char) *(str)), 1) \ + (((UCHAR(*(str))) < 0x80) ? \ + ((*(chPtr) = UCHAR(*(str))), 1) \ : Tcl_UtfToUniChar(str, chPtr)) /* @@ -4466,11 +4468,11 @@ MODULE_SCOPE void TclDbInitNewObj(Tcl_Obj *objPtr, const char *file, #define TclUtfPrev(src, start) \ (((src) < (start)+2) ? (start) : \ - ((unsigned char) *(src - 1)) < 0x80 ? (src)-1 : \ + (UCHAR(*((src) - 1))) < 0x80 ? 
(src)-1 : \ Tcl_UtfPrev(src, start)) #define TclUtfNext(src) \ - ((((unsigned char) *(src)) < 0x80) ? src + 1 : Tcl_UtfNext(src)) + (((UCHAR(*(src))) < 0x80) ? (src) + 1 : Tcl_UtfNext(src)) /* *---------------------------------------------------------------- diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 712beaa..9ffbfba 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -431,17 +431,17 @@ Tcl_UtfToUniChar( if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { /* * Four-byte-character lead byte followed by at least two trail bytes. - * (validity of 3th trail byte will be tested later) + * We don't test the validity of 3th trail byte, see [ed29806ba] */ #if TCL_UTF_MAX <= 4 Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) | ((src[2] & 0x3F) >> 4)) - 0x40; - if ((high < 0x400) && ((src[3] & 0xC0) == 0x80)) { + if (high < 0x400) { /* produce high surrogate, advance source pointer */ *chPtr = 0xD800 + high; return 1; } - /* out of range, < 0x10000 or > 0x10FFFF or invalid 3th byte */ + /* out of range, < 0x10000 or > 0x10FFFF */ #else if ((src[3] & 0xC0) == 0x80) { *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) @@ -557,7 +557,7 @@ Tcl_UtfCharComplete( * a complete UTF-8 character. */ int length) /* Length of above string in bytes. 
*/ { - return length >= totalBytes[(unsigned char)*src]; + return length >= totalBytes[UCHAR(*src)]; } /* @@ -604,7 +604,7 @@ Tcl_NumUtfChars( register const char *endPtr = src + length - TCL_UTF_MAX; while (src < endPtr) { - if (((unsigned)(unsigned char)*src - 0xF0) < 5) { + if (((unsigned)UCHAR(*src) - 0xF0) < 5) { /* treat F0 - F4 as single character */ ch = 0; src++; @@ -615,7 +615,7 @@ Tcl_NumUtfChars( } endPtr += TCL_UTF_MAX; while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { - if (((unsigned)(unsigned char)*src - 0xF0) < 5) { + if (((unsigned)UCHAR(*src) - 0xF0) < 5) { /* treat F0 - F4 as single character */ ch = 0; src++; @@ -1031,7 +1031,7 @@ Tcl_UtfToUpper( * char to dst if its size is <= the original char. */ - if (len < UtfCount(upChar) || ((upChar & 0xF800) == 0xD800)) { + if (len < UtfCount(upChar) || ((upChar & ~0x7FF) == 0xD800)) { memmove(dst, src, len); dst += len; } else { @@ -1084,7 +1084,7 @@ Tcl_UtfToLower( * char to dst if its size is <= the original char. */ - if (len < UtfCount(lowChar) || ((lowChar & 0xF800) == 0xD800)) { + if (len < UtfCount(lowChar) || ((lowChar & ~0x7FF) == 0xD800)) { memmove(dst, src, len); dst += len; } else { @@ -1134,7 +1134,7 @@ Tcl_UtfToTitle( len = TclUtfToUCS4(src, &ch); titleChar = UCS4ToTitle(ch); - if (len < UtfCount(titleChar) || ((titleChar & 0xF800) == 0xD800)) { + if (len < UtfCount(titleChar) || ((titleChar & ~0x7FF) == 0xD800)) { memmove(dst, src, len); dst += len; } else { @@ -1150,7 +1150,7 @@ Tcl_UtfToTitle( lowChar = UCS4ToLower(lowChar); } - if (len < UtfCount(lowChar) || ((lowChar & 0xF800) == 0xD800)) { + if (len < UtfCount(lowChar) || ((lowChar & ~0x7FF) == 0xD800)) { memmove(dst, src, len); dst += len; } else { |