1 files changed, 24 insertions, 14 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 524d7ad..43958af 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -588,29 +588,35 @@ Tcl_UtfCharComplete(
 int
 Tcl_NumUtfChars(
     const char *src,	/* The UTF-8 string to measure. */
-    int length)			/* The length of the string in bytes, or -1
-				 * for strlen(string). */
+    int length)		/* The length of the string in bytes, or -1
+			 * for strlen(string). */
 {
     Tcl_UniChar ch = 0;
     int i = 0;
 
-    /*
-     * The separate implementations are faster.
-     *
-     * Since this is a time-sensitive function, we also do the check for the
-     * single-byte char case specially.
-     */
-
     if (length < 0) {
-	while (*src != '\0') {
+	/* string is NUL-terminated, so TclUtfToUniChar calls are safe. */
+	while ((*src != '\0') && (i < INT_MAX)) {
 	    src += TclUtfToUniChar(src, &ch);
 	    i++;
 	}
-	if (i < 0) i = INT_MAX; /* Bug [2738427] */
     } else {
-	const char *endPtr = src + length - TCL_UTF_MAX;
+	/* Will return value between 0 and length. No overflow checks. */
+
+	/* Pointer to the end of string. Never read endPtr[0] */
+	const char *endPtr = src + length;
+	/* Pointer to breakpoint in scan where optimization is lost */
+	const char *optPtr = endPtr - TCL_UTF_MAX + 1;
 
-	while (src < endPtr) {
+	/*
+	 * Optimize away the call in this loop. Justified because...
+	 *	when (src < optPtr), (endPtr - src) > (endPtr - optPtr)
+	 * By initialization above (endPtr - optPtr) = TCL_UTF_MAX - 1
+	 * So (endPtr - src) >= TCL_UTF_MAX, and passing that to
+	 * Tcl_UtfCharComplete we know will cause return of 1.
+	 */
+	while ((src < optPtr)
+		/* && Tcl_UtfCharComplete(src, endPtr - src) */ ) {
 #if TCL_UTF_MAX < 4
 	    if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
 		/* treat F0 - F4 as single character */
@@ -621,7 +627,7 @@ Tcl_NumUtfChars(
 	    src += TclUtfToUniChar(src, &ch);
 	    i++;
 	}
-	endPtr += TCL_UTF_MAX;
+	/* Loop over the remaining string where call must happen */
 	while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
 #if TCL_UTF_MAX < 4
 	    if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
@@ -634,6 +640,10 @@ Tcl_NumUtfChars(
 	    i++;
 	}
 	if (src < endPtr) {
+	    /*
+	     * String ends in an incomplete UTF-8 sequence.
+	     * Count every byte in it.
+	     */
 	    i += endPtr - src;
 	}
     }