diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2022-03-29 22:17:33 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2022-03-29 22:17:33 (GMT) |
commit | a2a0ca2a1734d496f9500dc2ae02772a9eac548d (patch) | |
tree | 0a13f841c77376f33919ac3b86328af05bc8bf03 /generic/tclUtf.c | |
parent | b19862400b0fde8a9ee9233384130eb36164597a (diff) | |
parent | 60e9b38dfbeb8b382fd60528363fe726331ac4db (diff) | |
download | tcl-a2a0ca2a1734d496f9500dc2ae02772a9eac548d.zip tcl-a2a0ca2a1734d496f9500dc2ae02772a9eac548d.tar.gz tcl-a2a0ca2a1734d496f9500dc2ae02772a9eac548d.tar.bz2 |
Merge 9.0. Fix CONTINUATION macro, and testcases
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 91 |
1 files changed, 76 insertions, 15 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index a04e41c..deb6d3e 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -811,6 +811,7 @@ Tcl_UtfCharComplete( *--------------------------------------------------------------------------- */ +#undef Tcl_NumUtfChars size_t Tcl_NumUtfChars( const char *src, /* The UTF-8 string to measure. */ @@ -863,6 +864,58 @@ Tcl_NumUtfChars( return i; } +size_t +TclNumUtfChars( + const char *src, /* The UTF-8 string to measure. */ + size_t length) /* The length of the string in bytes, or + * TCL_INDEX_NONE for strlen(src). */ +{ + unsigned short ch = 0; + size_t i = 0; + + if (length == TCL_INDEX_NONE) { + /* string is NUL-terminated, so TclUtfToUniChar calls are safe. */ + while (*src != '\0') { + src += Tcl_UtfToChar16(src, &ch); + i++; + } + } else { + /* Will return value between 0 and length. No overflow checks. */ + + /* Pointer to the end of string. Never read endPtr[0] */ + const char *endPtr = src + length; + /* Pointer to last byte where optimization still can be used */ + const char *optPtr = endPtr - 4; + + /* + * Optimize away the call in this loop. Justified because... + * when (src <= optPtr), (endPtr - src) >= (endPtr - optPtr) + * By initialization above (endPtr - optPtr) = TCL_UTF_MAX + * So (endPtr - src) >= TCL_UTF_MAX, and passing that to + * Tcl_UtfCharComplete we know will cause return of 1. + */ + while (src <= optPtr + /* && Tcl_UtfCharComplete(src, endPtr - src) */ ) { + src += Tcl_UtfToChar16(src, &ch); + i++; + } + /* Loop over the remaining string where call must happen */ + while (src < endPtr) { + if (Tcl_UtfCharComplete(src, endPtr - src)) { + src += Tcl_UtfToChar16(src, &ch); + } else { + /* + * src points to incomplete UTF-8 sequence + * Treat first byte as character and count it + */ + src++; + } + i++; + } + } + return i; +} + /* *--------------------------------------------------------------------------- * @@ -1179,34 +1232,42 @@ Tcl_UniCharAtIndex( *--------------------------------------------------------------------------- */ +#undef Tcl_UtfAtIndex const char * Tcl_UtfAtIndex( const char *src, /* The UTF-8 string. */ size_t index) /* The position of the desired character. */ { - Tcl_UniChar ch = 0; -#if TCL_UTF_MAX < 4 - size_t len = 0; -#endif + int ch = 0; if (index != TCL_INDEX_NONE) { while (index--) { -#if TCL_UTF_MAX < 4 - src += (len = TclUtfToUniChar(src, &ch)); -#else - src += TclUtfToUniChar(src, &ch); -#endif + src += Tcl_UtfToUniChar(src, &ch); } -#if TCL_UTF_MAX < 4 - if ((ch >= 0xD800) && (len < 3)) { - /* Index points at character following high Surrogate */ - src += TclUtfToUniChar(src, &ch); - } -#endif } return src; } +const char * +TclUtfAtIndex( + const char *src, /* The UTF-8 string. */ + size_t index) /* The position of the desired character. */ +{ + unsigned short ch = 0; + size_t len = 0; + + if (index != TCL_INDEX_NONE) { + while (index--) { + src += (len = Tcl_UtfToChar16(src, &ch)); + } + if ((ch >= 0xD800) && (len < 3)) { + /* Index points at character following high Surrogate */ + src += Tcl_UtfToChar16(src, &ch); + } + } + return src; +} + /* *--------------------------------------------------------------------------- * |