diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2022-03-22 15:38:25 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2022-03-22 15:38:25 (GMT) |
commit | b5e6f95ea90be59cb281c089806f0e446e1272bc (patch) | |
tree | 5b448c643d378c730f29b9b7892a6a7f93b31b98 /generic/tclUtf.c | |
parent | 7d893da1b984ded235163f3ec8018195d9058f2a (diff) | |
download | tcl-b5e6f95ea90be59cb281c089806f0e446e1272bc.zip tcl-b5e6f95ea90be59cb281c089806f0e446e1272bc.tar.gz tcl-b5e6f95ea90be59cb281c089806f0e446e1272bc.tar.bz2 |
Feature-complete
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 57 |
1 files changed, 56 insertions, 1 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 4dd1e09..eda317f 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -799,7 +799,7 @@ Tcl_UtfCharComplete(
  */
 
 int
-Tcl_NumUtfChars(
+TclNumUtfChars(
     const char *src,           /* The UTF-8 string to measure. */
     int length)                /* The length of the string in bytes, or -1
                                 * for strlen(string). */
@@ -850,6 +850,61 @@ Tcl_NumUtfChars(
     return i;
 }
 
+#if TCL_UTF_MAX > 3 /* Variant that counts by stepping with Tcl_UtfToChar16. */
+#undef Tcl_NumUtfChars /* Remove any macro of this name before defining it. */
+int
+Tcl_NumUtfChars(
+    const char *src,           /* The UTF-8 string to measure. */
+    int length)                /* The length of the string in bytes, or -1
+                                * for strlen(src). */
+{
+    unsigned short ch = 0;     /* Receives each decoded unit; reused across calls. */
+    int i = 0;                 /* Running count; this is the return value. */
+
+    if (length < 0) {
+        /* NUL-terminated: Tcl_UtfToChar16 is safe; count is capped at INT_MAX. */
+        while ((*src != '\0') && (i < INT_MAX)) {
+            src += Tcl_UtfToChar16(src, &ch);
+            i++;
+        }
+    } else {
+        /* Result lies between 0 and length, so no overflow check is made. */
+
+        /* One past the last byte of the string; endPtr[0] is never read. */
+        const char *endPtr = src + length;
+        /* Last position from which at least 4 bytes remain before endPtr. */
+        const char *optPtr = endPtr - 4;
+
+        /*
+         * Fast path: the Tcl_UtfCharComplete call is omitted here because,
+         * when (src <= optPtr), (endPtr - src) >= (endPtr - optPtr), and
+         * by the initialization above (endPtr - optPtr) == 4.
+         * So (endPtr - src) >= 4, which is taken to guarantee that
+         * Tcl_UtfCharComplete would return 1 for that length.
+         */
+        while (src <= optPtr
+                /* && Tcl_UtfCharComplete(src, endPtr - src) */ ) {
+            src += Tcl_UtfToChar16(src, &ch);
+            i++;
+        }
+        /* Tail: fewer than 4 bytes remain, so completeness must be checked. */
+        while (src < endPtr) {
+            if (Tcl_UtfCharComplete(src, endPtr - src)) {
+                src += Tcl_UtfToChar16(src, &ch);
+            } else {
+                /*
+                 * src points to an incomplete UTF-8 sequence.
+                 * Treat its first byte as one character and count it.
+                 */
+                src++;
+            }
+            i++;
+        }
+    }
+    return i;
+}
+#endif /* TCL_UTF_MAX > 3 */
+
 /*
  *---------------------------------------------------------------------------
  * |