summaryrefslogtreecommitdiffstats
path: root/generic
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2020-05-01 14:20:21 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2020-05-01 14:20:21 (GMT)
commitc0381fd712c458f3b4ee476e00d0857bec4cf5cc (patch)
tree073e28ab81f4331a3b1b9dceea126f1e278bce76 /generic
parenta3d900f322a935841622ac0c8d81a45c91a16e65 (diff)
parent9eaf82b745ac07bc55f7238813c449fc5a447cf8 (diff)
downloadtcl-c0381fd712c458f3b4ee476e00d0857bec4cf5cc.zip
tcl-c0381fd712c458f3b4ee476e00d0857bec4cf5cc.tar.gz
tcl-c0381fd712c458f3b4ee476e00d0857bec4cf5cc.tar.bz2
Fix first part of [ed29806baf]: Tcl_UtfToUniChar reads more than TCL_UTF_MAX bytes.
Tcl_UtfToUniChar() now never reads more than TCL_UTF_MAX bytes any more. Since the UtfToUtf encoder/decoder now uses TclUtfToUCS4() it doesn't join 2 surrogates as 2 x 3-byte sequences any more. Actually, it shouldn't, because such sequences are invalid UTF-8. Therefore, added the ucs2 constraint to testcase encoding-15.4. Let's see how TIP #573 goes, this TIP should make this change official. Other callers of Tcl_UtfToUniChar() needs to be revised for the same problem. Most callers will need to change Tcl_UtfToUniChar() -> TclUtfToUCS4() and Tcl_UtfCharComplete() -> TclUCS4Complete(), but that's not done yet.
Diffstat (limited to 'generic')
-rw-r--r--generic/tclDecls.h2
-rw-r--r--generic/tclEncoding.c20
-rw-r--r--generic/tclInt.h9
-rw-r--r--generic/tclUtf.c16
4 files changed, 26 insertions, 21 deletions
diff --git a/generic/tclDecls.h b/generic/tclDecls.h
index 4531be3..c713469 100644
--- a/generic/tclDecls.h
+++ b/generic/tclDecls.h
@@ -4181,7 +4181,7 @@ extern const TclStubs *tclStubsPtr;
#if defined(USE_TCL_STUBS) && (TCL_UTF_MAX > 3)
# undef Tcl_UtfCharComplete
# define Tcl_UtfCharComplete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \
- ? 4 : tclStubsPtr->tcl_UtfCharComplete((src), (length)))
+ ? ((length) >= 4) : tclStubsPtr->tcl_UtfCharComplete((src), (length)))
#endif
#endif /* _TCLDECLS */
diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c
index 4789b7f..444f99e 100644
--- a/generic/tclEncoding.c
+++ b/generic/tclEncoding.c
@@ -2300,7 +2300,7 @@ UtfToUtfProc(
const char *srcStart, *srcEnd, *srcClose;
const char *dstStart, *dstEnd;
int result, numChars, charLimit = INT_MAX;
- Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr;
+ int *chPtr = (int *) statePtr;
if (flags & TCL_ENCODING_START) {
*statePtr = 0;
@@ -2321,7 +2321,7 @@ UtfToUtfProc(
dstEnd = dst + dstLen - TCL_UTF_MAX;
for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
- if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
+ if ((src > srcClose) && (!TclUCS4Complete(src, srcEnd - src))) {
/*
* If there is more string to follow, this will ensure that the
* last UTF-8 character in the source buffer hasn't been cut off.
@@ -2341,6 +2341,7 @@ UtfToUtfProc(
*/
*dst++ = *src++;
+ *chPtr = 0; /* reset surrogate handling */
} else if (pureNullMode == 1 && UCHAR(*src) == 0xC0 &&
(src + 1 < srcEnd) && UCHAR(*(src+1)) == 0x80) {
/*
@@ -2348,24 +2349,25 @@ UtfToUtfProc(
*/
*dst++ = 0;
+ *chPtr = 0; /* reset surrogate handling */
src += 2;
- } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
+ } else if (!TclUCS4Complete(src, srcEnd - src)) {
/*
- * Always check before using TclUtfToUniChar. Not doing can so
+ * Always check before using TclUtfToUCS4. Not doing can so
* cause it run beyond the end of the buffer! If we happen such an
* incomplete char its bytes are made to represent themselves.
*/
- *chPtr = (unsigned char) *src;
+ *chPtr = UCHAR(*src);
src += 1;
dst += Tcl_UniCharToUtf(*chPtr, dst);
} else {
- src += TclUtfToUniChar(src, chPtr);
+ src += TclUtfToUCS4(src, chPtr);
if ((*chPtr | 0x7FF) == 0xDFFF) {
/* A surrogate character is detected, handle especially */
- Tcl_UniChar low = *chPtr;
- size_t len = (src <= srcEnd-3) ? Tcl_UtfToUniChar(src, &low) : 0;
- if (((low | 0x3FF) != 0xDFFF) || (*chPtr & 0x400)) {
+ int low = *chPtr;
+ size_t len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
+ if (((low & ~0x3FF) != 0xC00) || (*chPtr & 0x400)) {
*dst++ = (char) (((*chPtr >> 12) | 0xE0) & 0xEF);
*dst++ = (char) (((*chPtr >> 6) | 0x80) & 0xBF);
*dst++ = (char) ((*chPtr | 0x80) & 0xBF);
diff --git a/generic/tclInt.h b/generic/tclInt.h
index 2ff644e..9ef1065 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -3252,8 +3252,11 @@ MODULE_SCOPE int TclUtfCasecmp(const char *cs, const char *ct);
MODULE_SCOPE int TclUtfCount(int ch);
#if TCL_UTF_MAX > 3
# define TclUtfToUCS4 Tcl_UtfToUniChar
+# define TclUCS4Complete Tcl_UtfCharComplete
#else
- MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr);
+ MODULE_SCOPE int TclUtfToUCS4(const char *src, int *ucs4Ptr);
+# define TclUCS4Complete(src, length) (((unsigned)((unsigned char)*(src) - 0xF0) < 5) \
+ ? ((length) >= 4) : Tcl_UtfCharComplete((src), (length)))
#endif
MODULE_SCOPE Tcl_Obj * TclpNativeToNormalized(ClientData clientData);
MODULE_SCOPE Tcl_Obj * TclpFilesystemPathType(Tcl_Obj *pathPtr);
@@ -4655,8 +4658,8 @@ MODULE_SCOPE const TclFileAttrProcs tclpFileAttrProcs[];
#if TCL_UTF_MAX > 3
#define TclUtfToUniChar(str, chPtr) \
- ((((unsigned char) *(str)) < 0x80) ? \
- ((*(chPtr) = (unsigned char) *(str)), 1) \
+ (((UCHAR(*(str))) < 0x80) ? \
+ ((*(chPtr) = UCHAR(*(str))), 1) \
: Tcl_UtfToUniChar(str, chPtr))
#else
#define TclUtfToUniChar(str, chPtr) \
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 90e974f..22315b4 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -597,16 +597,16 @@ Tcl_UtfToChar16(
if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
/*
* Four-byte-character lead byte followed by at least two trail bytes.
- * (validity of 3th trail byte will be tested later)
+ * We don't test the validity of 3th trail byte, see [ed29806ba]
*/
Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
| ((src[2] & 0x3F) >> 4)) - 0x40;
- if ((high < 0x400) && ((src[3] & 0xC0) == 0x80)) {
+ if (high < 0x400) {
/* produce high surrogate, advance source pointer */
*chPtr = 0xD800 + high;
return 1;
}
- /* out of range, < 0x10000 or > 0x10FFFF or invalid 3th byte */
+ /* out of range, < 0x10000 or > 0x10FFFF */
}
/*
@@ -771,7 +771,7 @@ Tcl_UtfCharComplete(
* a complete UTF-8 character. */
int length) /* Length of above string in bytes. */
{
- return length >= complete[(unsigned char)*src];
+ return length >= complete[UCHAR(*src)];
}
/*
@@ -1238,7 +1238,7 @@ Tcl_UtfToUpper(
* char to dst if its size is <= the original char.
*/
- if ((len < TclUtfCount(upChar)) || ((upChar & 0xF800) == 0xD800)) {
+ if ((len < TclUtfCount(upChar)) || ((upChar & ~0x7FF) == 0xD800)) {
memmove(dst, src, len);
dst += len;
} else {
@@ -1291,7 +1291,7 @@ Tcl_UtfToLower(
* char to dst if its size is <= the original char.
*/
- if ((len < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) {
+ if ((len < TclUtfCount(lowChar)) || ((lowChar & ~0x7FF) == 0xD800)) {
memmove(dst, src, len);
dst += len;
} else {
@@ -1341,7 +1341,7 @@ Tcl_UtfToTitle(
len = TclUtfToUCS4(src, &ch);
titleChar = Tcl_UniCharToTitle(ch);
- if ((len < TclUtfCount(titleChar)) || ((titleChar & 0xF800) == 0xD800)) {
+ if ((len < TclUtfCount(titleChar)) || ((titleChar & ~0x7FF) == 0xD800)) {
memmove(dst, src, len);
dst += len;
} else {
@@ -1357,7 +1357,7 @@ Tcl_UtfToTitle(
lowChar = Tcl_UniCharToLower(lowChar);
}
- if ((len < TclUtfCount(lowChar)) || ((lowChar & 0xF800) == 0xD800)) {
+ if ((len < TclUtfCount(lowChar)) || ((lowChar & ~0x7FF) == 0xD800)) {
memmove(dst, src, len);
dst += len;
} else {