summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--generic/tclUtf.c38
1 files changed, 24 insertions, 14 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 524d7ad..43958af 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -588,29 +588,35 @@ Tcl_UtfCharComplete(
int
Tcl_NumUtfChars(
const char *src, /* The UTF-8 string to measure. */
- int length) /* The length of the string in bytes, or -1
- * for strlen(string). */
+ int length) /* The length of the string in bytes, or -1
+ * for strlen(string). */
{
Tcl_UniChar ch = 0;
int i = 0;
- /*
- * The separate implementations are faster.
- *
- * Since this is a time-sensitive function, we also do the check for the
- * single-byte char case specially.
- */
-
if (length < 0) {
- while (*src != '\0') {
+ /* string is NUL-terminated, so TclUtfToUniChar calls are safe. */
+ while ((*src != '\0') && (i < INT_MAX)) {
src += TclUtfToUniChar(src, &ch);
i++;
}
- if (i < 0) i = INT_MAX; /* Bug [2738427] */
} else {
- const char *endPtr = src + length - TCL_UTF_MAX;
+ /* Will return value between 0 and length. No overflow checks. */
+
+ /* Pointer to the end of string. Never read endPtr[0] */
+ const char *endPtr = src + length;
+ /* Pointer to breakpoint in scan where optimization is lost */
+ const char *optPtr = endPtr - TCL_UTF_MAX + 1;
- while (src < endPtr) {
+ /*
+ * Optimize away the call in this loop. Justified because...
+ * when (src < optPtr), (endPtr - src) > (endPtr - optPtr)
+ * By initialization above (endPtr - optPtr) = TCL_UTF_MAX - 1
+ * So (endPtr - src) >= TCL_UTF_MAX, and passing that to
+ * Tcl_UtfCharComplete we know will cause return of 1.
+ */
+ while ((src < optPtr)
+ /* && Tcl_UtfCharComplete(src, endPtr - src) */ ) {
#if TCL_UTF_MAX < 4
if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
/* treat F0 - F4 as single character */
@@ -621,7 +627,7 @@ Tcl_NumUtfChars(
src += TclUtfToUniChar(src, &ch);
i++;
}
- endPtr += TCL_UTF_MAX;
+ /* Loop over the remaining string where call must happen */
while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
#if TCL_UTF_MAX < 4
if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
@@ -634,6 +640,10 @@ Tcl_NumUtfChars(
i++;
}
if (src < endPtr) {
+ /*
+ * String ends in an incomplete UTF-8 sequence.
+ * Count every byte in it.
+ */
i += endPtr - src;
}
}