From c2fa3a3fa94ad6516014f5376cb9d97e8b5550bb Mon Sep 17 00:00:00 2001 From: dgp Date: Wed, 6 May 2020 21:08:35 +0000 Subject: Tighten optimization in Tcl_NumUtfChars. Explain in comments. --- generic/tclUtf.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 24ec3d2..6a142bc 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -512,38 +512,49 @@ Tcl_UtfCharComplete( int Tcl_NumUtfChars( - register CONST char *src, /* The UTF-8 string to measure. */ - int length) /* The length of the string in bytes, or -1 - * for strlen(string). */ + CONST char *src, /* The UTF-8 string to measure. */ + int length) /* The length of the string in bytes, or -1 + * for strlen(string). */ { Tcl_UniChar ch; - register int i = 0; - - /* - * The separate implementations are faster. - * - * Since this is a time-sensitive function, we also do the check for the - * single-byte char case specially. - */ + int i = 0; if (length < 0) { + /* string is NUL-terminated, so TclUtfToUniChar calls are safe. */ while ((*src != '\0') && (i < INT_MAX)) { src += TclUtfToUniChar(src, &ch); i++; } } else { - register const char *endPtr = src + length - TCL_UTF_MAX; + /* Will return value between 0 and length. No overflow checks. */ + + /* Pointer to the end of string. Never read endPtr[0] */ + const char *endPtr = src + length; + /* Pointer to breakpoint in scan where optimization is lost */ + const char *optPtr = endPtr - TCL_UTF_MAX + 1; - while (src < endPtr) { + /* + * Optimize away the call in this loop. Justified because... + * when (src < optPtr), (endPtr - src) > (endPtr - optPtr) + * By initialization above (endPtr - optPtr) = TCL_UTF_MAX - 1 + * So (endPtr - src) >= TCL_UTF_MAX, and passing that to + * Tcl_UtfCharComplete we know will cause return of 1. + */ + while ((src < optPtr) + /* && Tcl_UtfCharComplete(src, endPtr - src) */ ) { src += TclUtfToUniChar(src, &ch); i++; } - endPtr += TCL_UTF_MAX; + /* Loop over the remaining string where call must happen */ while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { src += TclUtfToUniChar(src, &ch); i++; } if (src < endPtr) { + /* + * String ends in an incomplete UTF-8 sequence. + * Count every byte in it. + */ i += endPtr - src; } } -- cgit v0.12