summaryrefslogtreecommitdiffstats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2022-03-22 15:38:25 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2022-03-22 15:38:25 (GMT)
commitb5e6f95ea90be59cb281c089806f0e446e1272bc (patch)
tree5b448c643d378c730f29b9b7892a6a7f93b31b98 /generic/tclUtf.c
parent7d893da1b984ded235163f3ec8018195d9058f2a (diff)
downloadtcl-b5e6f95ea90be59cb281c089806f0e446e1272bc.zip
tcl-b5e6f95ea90be59cb281c089806f0e446e1272bc.tar.gz
tcl-b5e6f95ea90be59cb281c089806f0e446e1272bc.tar.bz2
Feature-complete
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r--generic/tclUtf.c57
1 files changed, 56 insertions, 1 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 4dd1e09..eda317f 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -799,7 +799,7 @@ Tcl_UtfCharComplete(
*/
int
-Tcl_NumUtfChars(
+TclNumUtfChars(
const char *src, /* The UTF-8 string to measure. */
int length) /* The length of the string in bytes, or -1
* for strlen(string). */
@@ -850,6 +850,61 @@ Tcl_NumUtfChars(
return i;
}
+#if TCL_UTF_MAX > 3
+#undef Tcl_NumUtfChars
+int
+Tcl_NumUtfChars(
+ const char *src, /* The UTF-8 string to measure. */
+ int length) /* The length of the string in bytes, or -1
+ * for strlen(string). */
+{
+ unsigned short ch = 0;
+ int i = 0;
+
+ if (length < 0) {
+ /* string is NUL-terminated, so TclUtfToUniChar calls are safe. */
+ while ((*src != '\0') && (i < INT_MAX)) {
+ src += Tcl_UtfToChar16(src, &ch);
+ i++;
+ }
+ } else {
+ /* Will return value between 0 and length. No overflow checks. */
+
+ /* Pointer to the end of string. Never read endPtr[0] */
+ const char *endPtr = src + length;
+ /* Pointer to last byte where optimization still can be used */
+ const char *optPtr = endPtr - 4;
+
+ /*
+ * Optimize away the call in this loop. Justified because...
+ * when (src <= optPtr), (endPtr - src) >= (endPtr - optPtr)
+ * By initialization above (endPtr - optPtr) = TCL_UTF_MAX
+ * So (endPtr - src) >= TCL_UTF_MAX, and passing that to
+ * Tcl_UtfCharComplete we know will cause return of 1.
+ */
+ while (src <= optPtr
+ /* && Tcl_UtfCharComplete(src, endPtr - src) */ ) {
+ src += Tcl_UtfToChar16(src, &ch);
+ i++;
+ }
+ /* Loop over the remaining string where call must happen */
+ while (src < endPtr) {
+ if (Tcl_UtfCharComplete(src, endPtr - src)) {
+ src += Tcl_UtfToChar16(src, &ch);
+ } else {
+ /*
+ * src points to incomplete UTF-8 sequence
+ * Treat first byte as character and count it
+ */
+ src++;
+ }
+ i++;
+ }
+ }
+ return i;
+}
+#endif
+
/*
*---------------------------------------------------------------------------
*