summaryrefslogtreecommitdiffstats
path: root/generic
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2020-05-01 13:38:22 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2020-05-01 13:38:22 (GMT)
commit9eaf82b745ac07bc55f7238813c449fc5a447cf8 (patch)
treef421a15863ac0ae1148013bf95a401b8eeba0357 /generic
parentba28f4892362a62309d8809b4dc5099a888a9f91 (diff)
parent62c00ac54a6f93ad1324d7e7aa5ef43623ca2415 (diff)
downloadtcl-9eaf82b745ac07bc55f7238813c449fc5a447cf8.zip
tcl-9eaf82b745ac07bc55f7238813c449fc5a447cf8.tar.gz
tcl-9eaf82b745ac07bc55f7238813c449fc5a447cf8.tar.bz2
Fix first part of [ed29806baf]: Tcl_UtfToUniChar reads more than TCL_UTF_MAX bytes.
Tcl_UtfToUniChar() no longer reads more than TCL_UTF_MAX bytes. The UtfToUtf encoder/decoder is adapted to do additional checks (more tricky than in Tcl 8.7, since we want compatibility with earlier 8.6 releases). Other callers of Tcl_UtfToUniChar() need to be revised for the same problem. Most callers will need to change Tcl_UtfToUniChar() -> TclUtfToUCS4() and Tcl_UtfCharComplete() -> TclUCS4Complete(), but that's not done yet.
Diffstat (limited to 'generic')
-rw-r--r--generic/tclEncoding.c38
-rw-r--r--generic/tclInt.h10
-rw-r--r--generic/tclUtf.c20
3 files changed, 42 insertions, 26 deletions
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 6ab0510..1584de0 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2331,6 +2331,7 @@ UtfToUtfProc(
*/
*dst++ = *src++;
+ *chPtr = 0; /* reset surrogate handling */
} else if (pureNullMode == 1 && UCHAR(*src) == 0xC0 &&
(src + 1 < srcEnd) && UCHAR(*(src+1)) == 0x80) {
/*
@@ -2338,35 +2339,48 @@ UtfToUtfProc(
*/
*dst++ = 0;
+ *chPtr = 0; /* reset surrogate handling */
src += 2;
- } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
+ } else if (!TclUCS4Complete(src, srcEnd - src)) {
/*
* Always check before using TclUtfToUniChar. Not doing can so
* cause it run beyond the end of the buffer! If we happen such an
* incomplete char its bytes are made to represent themselves.
*/
- *chPtr = (unsigned char) *src;
+ *chPtr = UCHAR(*src);
src += 1;
dst += Tcl_UniCharToUtf(*chPtr, dst);
} else {
- src += TclUtfToUniChar(src, chPtr);
- if ((*chPtr | 0x7FF) == 0xDFFF) {
+ size_t len = TclUtfToUniChar(src, chPtr);
+
+ src += len;
+ if ((*chPtr & ~0x7FF) == 0xD800) {
+ Tcl_UniChar low;
/* A surrogate character is detected, handle especially */
- Tcl_UniChar low = *chPtr;
- size_t len = (src <= srcEnd-3) ? Tcl_UtfToUniChar(src, &low) : 0;
- if (((low | 0x3FF) != 0xDFFF) || (*chPtr & 0x400)) {
- *dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF);
- *dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF);
- *dst++ = (char) ((*chPtr | 0x80) & 0xBF);
- continue;
+#if TCL_UTF_MAX <= 4
+ if ((len < 3) && ((src[3 - len] & 0xC0) != 0x80)) {
+ /* It's invalid. See [ed29806ba] */
+ *chPtr = UCHAR(src[-1]);
+ dst += Tcl_UniCharToUtf(*chPtr, dst);
+ continue;
+ }
+#endif
+ low = *chPtr;
+ len = (src <= srcEnd-3) ? Tcl_UtfToUniChar(src, &low) : 0;
+ if (((low & ~0x3FF) != 0xDC00) || (*chPtr & 0x400)) {
+ *dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF);
+ *dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF);
+ *dst++ = (char) ((*chPtr | 0x80) & 0xBF);
+ *chPtr = 0; /* reset surrogate handling */
+ continue;
} else if ((TCL_UTF_MAX > 3) || (pureNullMode == 1)) {
int full = (((*chPtr & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000;
*dst++ = (char) (((full >> 18) | 0xF0) & 0xF7);
*dst++ = (char) (((full >> 12) | 0x80) & 0xBF);
*dst++ = (char) (((full >> 6) | 0x80) & 0xBF);
*dst++ = (char) ((full | 0x80) & 0xBF);
- *chPtr = 0;
+ *chPtr = 0; /* reset surrogate handling */
src += len;
continue;
}
diff --git a/generic/tclInt.h b/generic/tclInt.h
index 5df9aac..593d878 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3184,6 +3184,8 @@ MODULE_SCOPE int TclTrimRight(const char *bytes, int numBytes,
const char *trim, int numTrim);
MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct);
MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr);
+# define TclUCS4Complete(src, length) (((unsigned)(UCHAR(*(src)) - 0xF0) < 5) \
+ ? ((length) >= 4) : Tcl_UtfCharComplete((src), (length)))
MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData);
MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr);
MODULE_SCOPE int TclpDlopen(Tcl_Interp *interp, Tcl_Obj *pathPtr,
@@ -4436,8 +4438,8 @@ MODULE_SCOPE void TclDbInitNewObj(Tcl_Obj *objPtr, const char *file,
*/
#define TclUtfToUniChar(str, chPtr) \
- ((((unsigned char) *(str)) < 0x80) ? \
- ((*(chPtr) = (unsigned char) *(str)), 1) \
+ (((UCHAR(*(str))) < 0x80) ? \
+ ((*(chPtr) = UCHAR(*(str))), 1) \
: Tcl_UtfToUniChar(str, chPtr))
/*
@@ -4466,11 +4468,11 @@ MODULE_SCOPE void TclDbInitNewObj(Tcl_Obj *objPtr, const char *file,
#define TclUtfPrev(src, start) \
(((src) < (start)+2) ? (start) : \
- ((unsigned char) *(src - 1)) < 0x80 ? (src)-1 : \
+ (UCHAR(*((src) - 1))) < 0x80 ? (src)-1 : \
Tcl_UtfPrev(src, start))
#define TclUtfNext(src) \
- ((((unsigned char) *(src)) < 0x80) ? src + 1 : Tcl_UtfNext(src))
+ (((UCHAR(*(src))) < 0x80) ? (src) + 1 : Tcl_UtfNext(src))
/*
*----------------------------------------------------------------
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 712beaa..9ffbfba 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -431,17 +431,17 @@ Tcl_UtfToUniChar(
if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
/*
* Four-byte-character lead byte followed by at least two trail bytes.
- * (validity of 3th trail byte will be tested later)
+ * We don't test the validity of 3th trail byte, see [ed29806ba]
*/
#if TCL_UTF_MAX <= 4
Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
| ((src[2] & 0x3F) >> 4)) - 0x40;
- if ((high < 0x400) && ((src[3] & 0xC0) == 0x80)) {
+ if (high < 0x400) {
/* produce high surrogate, advance source pointer */
*chPtr = 0xD800 + high;
return 1;
}
- /* out of range, < 0x10000 or > 0x10FFFF or invalid 3th byte */
+ /* out of range, < 0x10000 or > 0x10FFFF */
#else
if ((src[3] & 0xC0) == 0x80) {
*chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
@@ -557,7 +557,7 @@ Tcl_UtfCharComplete(
* a complete UTF-8 character. */
int length) /* Length of above string in bytes. */
{
- return length >= totalBytes[(unsigned char)*src];
+ return length >= totalBytes[UCHAR(*src)];
}
/*
@@ -604,7 +604,7 @@ Tcl_NumUtfChars(
register const char *endPtr = src + length - TCL_UTF_MAX;
while (src < endPtr) {
- if (((unsigned)(unsigned char)*src - 0xF0) < 5) {
+ if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
/* treat F0 - F4 as single character */
ch = 0;
src++;
@@ -615,7 +615,7 @@ Tcl_NumUtfChars(
}
endPtr += TCL_UTF_MAX;
while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
- if (((unsigned)(unsigned char)*src - 0xF0) < 5) {
+ if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
/* treat F0 - F4 as single character */
ch = 0;
src++;
@@ -1031,7 +1031,7 @@ Tcl_UtfToUpper(
* char to dst if its size is <= the original char.
*/
- if (len < UtfCount(upChar) || ((upChar & 0xF800) == 0xD800)) {
+ if (len < UtfCount(upChar) || ((upChar & ~0x7FF) == 0xD800)) {
memmove(dst, src, len);
dst += len;
} else {
@@ -1084,7 +1084,7 @@ Tcl_UtfToLower(
* char to dst if its size is <= the original char.
*/
- if (len < UtfCount(lowChar) || ((lowChar & 0xF800) == 0xD800)) {
+ if (len < UtfCount(lowChar) || ((lowChar & ~0x7FF) == 0xD800)) {
memmove(dst, src, len);
dst += len;
} else {
@@ -1134,7 +1134,7 @@ Tcl_UtfToTitle(
len = TclUtfToUCS4(src, &ch);
titleChar = UCS4ToTitle(ch);
- if (len < UtfCount(titleChar) || ((titleChar & 0xF800) == 0xD800)) {
+ if (len < UtfCount(titleChar) || ((titleChar & ~0x7FF) == 0xD800)) {
memmove(dst, src, len);
dst += len;
} else {
@@ -1150,7 +1150,7 @@ Tcl_UtfToTitle(
lowChar = UCS4ToLower(lowChar);
}
- if (len < UtfCount(lowChar) || ((lowChar & 0xF800) == 0xD800)) {
+ if (len < UtfCount(lowChar) || ((lowChar & ~0x7FF) == 0xD800)) {
memmove(dst, src, len);
dst += len;
} else {