summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--generic/tclCmdMZ.c2
-rw-r--r--generic/tclInt.h8
-rw-r--r--generic/tclStringObj.c6
-rw-r--r--generic/tclTest.c6
-rw-r--r--generic/tclUtf.c179
-rw-r--r--generic/tclUtil.c4
-rw-r--r--tests/utf.test20
7 files changed, 179 insertions, 46 deletions
diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c
index 935d42f..d7394fb 100644
--- a/generic/tclCmdMZ.c
+++ b/generic/tclCmdMZ.c
@@ -2529,7 +2529,7 @@ StringStartCmd(
break;
}
- next = Tcl_UtfPrev(p, string);
+ next = TclUtfPrev(p, string);
do {
next += delta;
delta = TclUtfToUCS4(next, &ch);
diff --git a/generic/tclInt.h b/generic/tclInt.h
index 8d2e68a..2ff644e 100644
--- a/generic/tclInt.h
+++ b/generic/tclInt.h
@@ -4689,6 +4689,14 @@ MODULE_SCOPE const TclFileAttrProcs tclpFileAttrProcs[];
(numChars) = _count; \
} while (0);
+/*
+ * TclUtfPrev: inline fast path in front of Tcl_UtfPrev(). Yields (start)
+ * when (src) is less than two bytes past (start), yields (src) - 1 when the
+ * byte just before (src) is below 0x80, and otherwise calls Tcl_UtfPrev().
+ * Both arguments can be evaluated more than once, so they must be free of
+ * side effects.
+ */
+#define TclUtfPrev(src, start) \
+ (((src) < (start) + 2) ? (start) : \
+ ((unsigned char) *((src) - 1)) < 0x80 ? (src) - 1 : \
+ Tcl_UtfPrev(src, start))
+
+/*
+ * TclUtfNext: inline fast path in front of Tcl_UtfNext(). Yields (src) + 1
+ * when the byte at (src) is below 0x80 and otherwise calls Tcl_UtfNext().
+ * The argument can be evaluated more than once.
+ */
+#define TclUtfNext(src) \
+ ((((unsigned char) *(src)) < 0x80) ? (src) + 1 : Tcl_UtfNext(src))
+
/*
*----------------------------------------------------------------
* Macro that encapsulates the logic that determines when it is safe to
diff --git a/generic/tclStringObj.c b/generic/tclStringObj.c
index 2025674..78e49f9 100644
--- a/generic/tclStringObj.c
+++ b/generic/tclStringObj.c
@@ -1171,10 +1171,10 @@ Tcl_AppendLimitedToObj(
}
eLen = strlen(ellipsis);
while (eLen > limit) {
- eLen = Tcl_UtfPrev(ellipsis+eLen, ellipsis) - ellipsis;
+ eLen = TclUtfPrev(ellipsis+eLen, ellipsis) - ellipsis;
}
- toCopy = Tcl_UtfPrev(bytes+limit+1-eLen, bytes) - bytes;
+ toCopy = TclUtfPrev(bytes+limit+1-eLen, bytes) - bytes;
}
/*
@@ -2614,7 +2614,7 @@ AppendPrintfToObjVA(
* multi-byte characters.
*/
- q = Tcl_UtfPrev(end, bytes);
+ q = TclUtfPrev(end, bytes);
if (!Tcl_UtfCharComplete(q, (int)(end - q))) {
end = q;
}
diff --git a/generic/tclTest.c b/generic/tclTest.c
index 008fdb7..19de8ee 100644
--- a/generic/tclTest.c
+++ b/generic/tclTest.c
@@ -6849,10 +6849,10 @@ TestUtfNextCmd(
memcpy(buffer + 1, bytes, numBytes);
buffer[0] = buffer[numBytes + 1] = '\x00';
- first = Tcl_UtfNext(buffer + 1);
+ first = TclUtfNext(buffer + 1);
while ((buffer[0] = *p++) != '\0') {
/* Run Tcl_UtfNext with many more possible bytes at src[-1], all should give the same result */
- result = Tcl_UtfNext(buffer + 1);
+ result = TclUtfNext(buffer + 1);
if (first != result) {
Tcl_AppendResult(interp, "Tcl_UtfNext is not supposed to read src[-1]", NULL);
return TCL_ERROR;
@@ -6908,7 +6908,7 @@ TestUtfPrevCmd(
bytes = (char *) Tcl_SetByteArrayLength(copy, numBytes+1);
bytes[numBytes] = '\0';
- result = Tcl_UtfPrev(bytes + offset, bytes);
+ result = TclUtfPrev(bytes + offset, bytes);
Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes));
Tcl_DecrRefCount(copy);
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 6adfba9..20b0ebe 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -90,6 +90,12 @@ static const unsigned char complete[256] = {
#endif
1,1,1,1,1,1,1,1,1,1,1
};
+
+/*
+ * Functions used only in this module.
+ */
+
+static int Invalid(unsigned char *src);
/*
*---------------------------------------------------------------------------
@@ -122,7 +128,56 @@ TclUtfCount(
}
return 3;
}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * Invalid --
+ *
+ * Utility routine to report whether /src/ points to the start of an
+ * invalid byte sequence that should be rejected. This might be because
+ * it is an overlong encoding, or because it encodes something out of
+ * the proper range. Caller guarantees that src[0] and src[1] are
+ * readable, and
+ *
+ * (src[0] >= 0xC0) && (src[0] != 0xC1)
+ * (src[1] >= 0x80) && (src[1] < 0xC0)
+ * (src[0] < ((TCL_UTF_MAX > 3) ? 0xF5 : 0xF0))
+ *
+ * Results:
+ * A boolean.
+ *---------------------------------------------------------------------------
+ */
+
+static const unsigned char bounds[28] = {
+    0x80, 0x80, /* \xC0 accepts \x80 only */
+    0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF,
+    0x80, 0xBF, /* (\xC4 - \xDC) -- all sequences valid */
+    0xA0, 0xBF, /* \xE0\x80 through \xE0\x9F are invalid prefixes */
+    0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, /* (\xE4 - \xEC) -- all valid */
+    0x90, 0xBF, /* \xF0\x80 through \xF0\x8F are invalid prefixes */
+    0x80, 0x8F /* \xF4\x90 and higher are invalid prefixes */
+};
+INLINE static int
+Invalid(
+    unsigned char *src) /* Points to lead byte of a UTF-8 byte sequence */
+{
+    unsigned char byte = *src;
+    int index;
+
+    /*
+     * Only lead bytes 0xC0..0xF4 that are multiples of 4 own a bounds[]
+     * pair. Any other byte must neither index the table nor read src[1].
+     */
+    if (byte < 0xC0 || byte > 0xF4 || (byte & 0x03)) {
+        return 0;
+    }
+    index = (byte - 0xC0) >> 1;
+    /* bounds[] gives the inclusive range accepted for src[1]. */
+    return (src[1] < bounds[index] || src[1] > bounds[index+1]);
+}
+
/*
*---------------------------------------------------------------------------
*
@@ -860,7 +915,7 @@ Tcl_UtfFindLast(
*
* Tcl_UtfNext --
*
- * Given a pointer to some location in a UTF-8 string, Tcl_UtfNext
+ * Given a pointer to some location in a UTF-8 string, Tcl_UtfNext
* returns a pointer to the next UTF-8 character in the string.
* The caller must not ask for the next character after the last
* character in the string if the string is not terminated by a null
@@ -880,8 +935,8 @@ const char *
Tcl_UtfNext(
const char *src) /* The current location in the string. */
{
- Tcl_UniChar ch = 0;
- int len;
+ int left = totalBytes[UCHAR(*src)];
+ const char *next = src + 1;
if (((*src) & 0xC0) == 0x80) {
if ((((*++src) & 0xC0) == 0x80) && (((*++src) & 0xC0) == 0x80)) {
@@ -889,14 +944,22 @@ Tcl_UtfNext(
}
return src;
}
- len = TclUtfToUniChar(src, &ch);
-#if TCL_UTF_MAX <= 3
- if ((ch >= 0xD800) && (len < 3)) {
- len += TclUtfToUniChar(src + len, &ch);
+ while (--left) {
+ if ((*next & 0xC0) != 0x80) {
+ /*
+ * next points to a non-trail byte; we ran out of trail bytes
+ * before the needs of the lead byte were satisfied.
+ * Let the (malformed) lead byte alone be a character
+ */
+ return src + 1;
+ }
+ next++;
}
-#endif
- return src + len;
+ if (Invalid((unsigned char *)src)) {
+ return src + 1;
+ }
+ return next;
}
/*
@@ -953,30 +1016,92 @@ Tcl_UtfPrev(
const char *src, /* A location in a UTF-8 string. */
const char *start) /* Pointer to the beginning of the string */
{
- const char *look;
- int i, byte;
-
- look = --src;
- for (i = 0; i < 4; i++) {
- if (look < start) {
- if (src < start) {
- src = start;
- }
- break;
- }
- byte = *((unsigned char *) look);
+ int trailBytesSeen = 0; /* How many trail bytes have been verified? */
+ CONST char *fallback = src - 1;
+ /* If we cannot find a lead byte that might
+ * start a prefix of a valid UTF byte sequence,
+ * we will fall back to a one-byte back step */
+ unsigned char *look = (unsigned char *)fallback;
+ /* Start search at the fallback position */
+
+ /* Quick boundary case exit. */
+ if (fallback <= start) {
+ return start;
+ }
+
+ do {
+ unsigned char byte = look[0];
+
if (byte < 0x80) {
- break;
+ /*
+ * Single byte character. Either this is a correct previous
+ * character, or it is followed by at least one trail byte
+ * which indicates a malformed sequence. In either case the
+ * correct result is to return the fallback.
+ */
+ return fallback;
}
if (byte >= 0xC0) {
- if (totalBytes[byte] <= i) {
- break;
+ /* Non-trail byte; May be multibyte lead. */
+
+ if ((trailBytesSeen == 0)
+ /*
+ * We've seen no trailing context to use to check
+ * anything. From what we know, this non-trail byte
+ * is a prefix of a previous character, and accepting
+ * it (the fallback) is correct.
+ */
+
+ || (trailBytesSeen >= totalBytes[byte])) {
+ /*
+ * That is, (1 + trailBytesSeen > needed).
+ * We've examined more bytes than needed to complete
+ * this lead byte. No matter about well-formedness or
+ * validity, the sequence starting with this lead byte
+ * will never include the fallback location, so we must
+ * return the fallback location. See test utf-7.17
+ */
+ return fallback;
}
- return look;
+
+ /*
+ * trailBytesSeen > 0, so we can examine look[1] safely.
+ * Use that capability to screen out overlong sequences.
+ */
+
+ if (Invalid(look)) {
+ /* Reject */
+ return fallback;
+ }
+ return (CONST char *)look;
+ }
+
+ /* We saw a trail byte. */
+ trailBytesSeen++;
+
+ if ((CONST char *)look == start) {
+ /*
+ * Do not read before the start of the string
+ *
+ * If we get here, we've examined bytes at every location
+ * >= start and < src and all of them are trail bytes,
+ * including (*start). We need to return our fallback
+ * and exit this loop before we run past the start of the string.
+ */
+ return fallback;
}
+
+ /* Continue the search backwards... */
look--;
- }
- return src;
+ } while (trailBytesSeen < 4);
+
+ /*
+ * We've seen four consecutive trail bytes without meeting a lead byte.
+ * No well-formed UTF-8 sequence has that many, so we know there will
+ * not be a properly formed byte sequence to find, and we can stop
+ * looking, accepting the fallback.
+ */
+ return fallback;
}
/*
diff --git a/generic/tclUtil.c b/generic/tclUtil.c
index b48ce30..dd527dc 100644
--- a/generic/tclUtil.c
+++ b/generic/tclUtil.c
@@ -1707,7 +1707,7 @@ TclTrimRight(
const char *q = trim;
int pInc = 0, bytesLeft = numTrim;
- pp = Tcl_UtfPrev(p, bytes);
+ pp = TclUtfPrev(p, bytes);
do {
pp += pInc;
pInc = TclUtfToUCS4(pp, &ch1);
@@ -1858,7 +1858,7 @@ TclTrim(
* that we will not trim. Skip over it. */
if (numBytes > 0) {
const char *first = bytes + trimLeft;
- bytes = Tcl_UtfNext(first);
+ bytes = TclUtfNext(first);
numBytes -= (bytes - first);
if (numBytes > 0) {
diff --git a/tests/utf.test b/tests/utf.test
index 79f866a..885b057 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -600,19 +600,19 @@ test utf-7.23 {Tcl_UtfPrev} testutfprev {
} 4
test utf-7.24 {Tcl_UtfPrev -- overlong sequence} testutfprev {
testutfprev A\xC0\x81
-} 1
+} 2
test utf-7.25 {Tcl_UtfPrev -- overlong sequence} testutfprev {
testutfprev A\xC0\x81 2
} 1
test utf-7.26 {Tcl_UtfPrev -- overlong sequence} testutfprev {
testutfprev A\xE0\x80\x80
-} 1
+} 3
test utf-7.27 {Tcl_UtfPrev -- overlong sequence} testutfprev {
testutfprev A\xE0\x80
-} 1
+} 2
test utf-7.27.1 {Tcl_UtfPrev -- overlong sequence} testutfprev {
testutfprev A\xE0\x80\x80 3
-} 1
+} 2
test utf-7.28 {Tcl_UtfPrev -- overlong sequence} testutfprev {
testutfprev A\xE0
} 1
@@ -621,13 +621,13 @@ test utf-7.28.1 {Tcl_UtfPrev -- overlong sequence} testutfprev {
} 1
test utf-7.29 {Tcl_UtfPrev -- overlong sequence} testutfprev {
testutfprev A\xF0\x80\x80\x80
-} 1
+} 4
test utf-7.30 {Tcl_UtfPrev -- overlong sequence} testutfprev {
testutfprev A\xF0\x80\x80\x80 4
-} 1
+} 3
test utf-7.31 {Tcl_UtfPrev -- overlong sequence} testutfprev {
testutfprev A\xF0\x80\x80\x80 3
-} 1
+} 2
test utf-7.32 {Tcl_UtfPrev -- overlong sequence} testutfprev {
testutfprev A\xF0\x80\x80\x80 2
} 1
@@ -696,13 +696,13 @@ test utf-7.48.3 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev {
} 1
test utf-7.49 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev {
testutfprev A\xF4\x90\x80\x80
-} 1
+} 4
test utf-7.49.1 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev {
testutfprev A\xF4\x90\x80\x80 4
-} 1
+} 3
test utf-7.49.2 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev {
testutfprev A\xF4\x90\x80\x80 3
-} 1
+} 2
test utf-7.49.3 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev {
testutfprev A\xF4\x90\x80\x80 2
} 1