diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2020-05-01 14:20:21 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2020-05-01 14:20:21 (GMT) |
commit | c0381fd712c458f3b4ee476e00d0857bec4cf5cc (patch) | |
tree | 073e28ab81f4331a3b1b9dceea126f1e278bce76 /generic | |
parent | a3d900f322a935841622ac0c8d81a45c91a16e65 (diff) | |
parent | 9eaf82b745ac07bc55f7238813c449fc5a447cf8 (diff) | |
download | tcl-c0381fd712c458f3b4ee476e00d0857bec4cf5cc.zip tcl-c0381fd712c458f3b4ee476e00d0857bec4cf5cc.tar.gz tcl-c0381fd712c458f3b4ee476e00d0857bec4cf5cc.tar.bz2 |
Fix first part of [ed29806baf]: Tcl_UtfToUniChar reads more than TCL_UTF_MAX bytes.
Tcl_UtfToUniChar() now never reads more than TCL_UTF_MAX bytes any more.
Since the UtfToUtf encoder/decoder now uses TclUtfToUCS4() it doesn't join 2 surrogates as 2 x 3-byte sequences any more. Actually, it shouldn't, because such sequences are invalid UTF-8.
Therefore, added the ucs2 constraint to testcase encoding-15.4. Let's see how TIP #573 goes, this TIP should make this change official.
Other callers of Tcl_UtfToUniChar() needs to be revised for the same problem. Most callers will need to change Tcl_UtfToUniChar() -> TclUtfToUCS4() and Tcl_UtfCharComplete() -> TclUCS4Complete(), but that's not done yet.
Diffstat (limited to 'generic')
-rw-r--r-- | generic/tclDecls.h | 2 | ||||
-rw-r--r-- | generic/tclEncoding.c | 20 | ||||
-rw-r--r-- | generic/tclInt.h | 9 | ||||
-rw-r--r-- | generic/tclUtf.c | 16 |
4 files changed, 26 insertions, 21 deletions
diff --git a/generic/tclDecls.h b/generic/tclDecls.h index 4531be3..c713469 100644 --- a/generic/tclDecls.h +++ b/generic/tclDecls.h @@ -4181,7 +4181,7 @@ extern const TclStubs *tclStubsPtr; #if defined(USE_TCL_STUBS) && (TCL_UTF_MAX > 3) # undef Tcl_UtfCharComplete # define Tcl_UtfCharComplete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \ - ? 4 : tclStubsPtr->tcl_UtfCharComplete((src), (length))) + ? ((length) >= 4) : tclStubsPtr->tcl_UtfCharComplete((src), (length))) #endif #endif /* _TCLDECLS */ diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 4789b7f..444f99e 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2300,7 +2300,7 @@ UtfToUtfProc( const char *srcStart, *srcEnd, *srcClose; const char *dstStart, *dstEnd; int result, numChars, charLimit = INT_MAX; - Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr; + int *chPtr = (int *) statePtr; if (flags & TCL_ENCODING_START) { *statePtr = 0; @@ -2321,7 +2321,7 @@ UtfToUtfProc( dstEnd = dst + dstLen - TCL_UTF_MAX; for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { - if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { + if ((src > srcClose) && (!TclUCS4Complete(src, srcEnd - src))) { /* * If there is more string to follow, this will ensure that the * last UTF-8 character in the source buffer hasn't been cut off. @@ -2341,6 +2341,7 @@ UtfToUtfProc( */ *dst++ = *src++; + *chPtr = 0; /* reset surrogate handling */ } else if (pureNullMode == 1 && UCHAR(*src) == 0xC0 && (src + 1 < srcEnd) && UCHAR(*(src+1)) == 0x80) { /* @@ -2348,24 +2349,25 @@ UtfToUtfProc( */ *dst++ = 0; + *chPtr = 0; /* reset surrogate handling */ src += 2; - } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { + } else if (!TclUCS4Complete(src, srcEnd - src)) { /* - * Always check before using TclUtfToUniChar. Not doing can so + * Always check before using TclUtfToUCS4. Not doing can so * cause it run beyond the end of the buffer! If we happen such an * incomplete char its bytes are made to represent themselves. */ - *chPtr = (unsigned char) *src; + *chPtr = UCHAR(*src); src += 1; dst += Tcl_UniCharToUtf(*chPtr, dst); } else { - src += TclUtfToUniChar(src, chPtr); + src += TclUtfToUCS4(src, chPtr); if ((*chPtr | 0x7FF) == 0xDFFF) { /* A surrogate character is detected, handle especially */ - Tcl_UniChar low = *chPtr; - size_t len = (src <= srcEnd-3) ? Tcl_UtfToUniChar(src, &low) : 0; - if (((low | 0x3FF) != 0xDFFF) || (*chPtr & 0x400)) { + int low = *chPtr; + size_t len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; + if (((low & ~0x3FF) != 0xC00) || (*chPtr & 0x400)) { *dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF); *dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF); *dst++ = (char) ((*chPtr | 0x80) & 0xBF); diff --git a/generic/tclInt.h b/generic/tclInt.h index 2ff644e..9ef1065 100644 --- a/generic/tclInt.h +++ b/generic/tclInt.h @@ -3252,8 +3252,11 @@ MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct); MODULE_SCOPE int TclUtfCount(int ch); #if TCL_UTF_MAX > 3 # define TclUtfToUCS4 Tcl_UtfToUniChar +# define TclUCS4Complete Tcl_UtfCharComplete #else - MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr); + MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr); +# define TclUCS4Complete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \ + ? ((length) >= 4) : Tcl_UtfCharComplete((src), (length))) #endif MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData); MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr); @@ -4655,8 +4658,8 @@ MODULE_SCOPE const TclFileAttrProcs tclpFileAttrProcs[]; #if TCL_UTF_MAX > 3 #define TclUtfToUniChar(str, chPtr) \ - ((((unsigned char) *(str)) < 0x80) ? \ - ((*(chPtr) = (unsigned char) *(str)), 1) \ + (((UCHAR(*(str))) < 0x80) ? \ + ((*(chPtr) = UCHAR(*(str))), 1) \ : Tcl_UtfToUniChar(str, chPtr)) #else #define TclUtfToUniChar(str, chPtr) \ diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 90e974f..22315b4 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -597,16 +597,16 @@ Tcl_UtfToChar16( if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { /* * Four-byte-character lead byte followed by at least two trail bytes. - * (validity of 3th trail byte will be tested later) + * We don't test the validity of 3th trail byte, see [ed29806ba] */ Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2) | ((src[2] & 0x3F) >> 4)) - 0x40; - if ((high < 0x400) && ((src[3] & 0xC0) == 0x80)) { + if (high < 0x400) { /* produce high surrogate, advance source pointer */ *chPtr = 0xD800 + high; return 1; } - /* out of range, < 0x10000 or > 0x10FFFF or invalid 3th byte */ + /* out of range, < 0x10000 or > 0x10FFFF */ } /* @@ -771,7 +771,7 @@ Tcl_UtfCharComplete( * a complete UTF-8 character. */ int length) /* Length of above string in bytes. */ { - return length >= complete[(unsigned char)*src]; + return length >= complete[UCHAR(*src)]; } /* @@ -1238,7 +1238,7 @@ Tcl_UtfToUpper( * char to dst if its size is <= the original char. */ - if ((len < TclUtfCount(upChar)) || ((upChar & 0xF800) == 0xD800)) { + if ((len < TclUtfCount(upChar)) || ((upChar & ~0x7FF) == 0xD800)) { memmove(dst, src, len); dst += len; } else { @@ -1291,7 +1291,7 @@ Tcl_UtfToLower( * char to dst if its size is <= the original char. */ - if ((len < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) { + if ((len < TclUtfCount(lowChar)) || ((lowChar & ~0x7FF) == 0xD800)) { memmove(dst, src, len); dst += len; } else { @@ -1341,7 +1341,7 @@ Tcl_UtfToTitle( len = TclUtfToUCS4(src, &ch); titleChar = Tcl_UniCharToTitle(ch); - if ((len < TclUtfCount(titleChar)) || ((titleChar & 0xF800) == 0xD800)) { + if ((len < TclUtfCount(titleChar)) || ((titleChar & ~0x7FF) == 0xD800)) { memmove(dst, src, len); dst += len; } else { @@ -1357,7 +1357,7 @@ Tcl_UtfToTitle( lowChar = Tcl_UniCharToLower(lowChar); } - if ((len < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) { + if ((len < TclUtfCount(lowChar)) || ((lowChar & ~0x7FF) == 0xD800)) { memmove(dst, src, len); dst += len; } else { |