diff options
| -rw-r--r-- | generic/tclUtf.c | 38 |
1 files changed, 24 insertions, 14 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 524d7ad..43958af 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -588,29 +588,35 @@ Tcl_UtfCharComplete( int Tcl_NumUtfChars( const char *src, /* The UTF-8 string to measure. */ - int length) /* The length of the string in bytes, or -1 - * for strlen(string). */ + int length) /* The length of the string in bytes, or -1 + * for strlen(string). */ { Tcl_UniChar ch = 0; int i = 0; - /* - * The separate implementations are faster. - * - * Since this is a time-sensitive function, we also do the check for the - * single-byte char case specially. - */ - if (length < 0) { - while (*src != '\0') { + /* string is NUL-terminated, so TclUtfToUniChar calls are safe. */ + while ((*src != '\0') && (i < INT_MAX)) { src += TclUtfToUniChar(src, &ch); i++; } - if (i < 0) i = INT_MAX; /* Bug [2738427] */ } else { - const char *endPtr = src + length - TCL_UTF_MAX; + /* Will return value between 0 and length. No overflow checks. */ + + /* Pointer to the end of string. Never read endPtr[0] */ + const char *endPtr = src + length; + /* Pointer to breakpoint in scan where optimization is lost */ + const char *optPtr = endPtr - TCL_UTF_MAX + 1; - while (src < endPtr) { + /* + * Optimize away the call in this loop. Justified because... + * when (src < optPtr), (endPtr - src) > (endPtr - optPtr) + * By initialization above (endPtr - optPtr) = TCL_UTF_MAX - 1 + * So (endPtr - src) >= TCL_UTF_MAX, and passing that to + * Tcl_UtfCharComplete we know will cause return of 1. + */ + while ((src < optPtr) + /* && Tcl_UtfCharComplete(src, endPtr - src) */ ) { #if TCL_UTF_MAX < 4 if (((unsigned)UCHAR(*src) - 0xF0) < 5) { /* treat F0 - F4 as single character */ @@ -621,7 +627,7 @@ Tcl_NumUtfChars( src += TclUtfToUniChar(src, &ch); i++; } - endPtr += TCL_UTF_MAX; + /* Loop over the remaining string where call must happen */ while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { #if TCL_UTF_MAX < 4 if (((unsigned)UCHAR(*src) - 0xF0) < 5) { @@ -634,6 +640,10 @@ Tcl_NumUtfChars( i++; } if (src < endPtr) { + /* + * String ends in an incomplete UTF-8 sequence. + * Count every byte in it. + */ i += endPtr - src; } } |
