Fix regression in Tcl_NumUtfChars, caused by this commit: [6596c4af31e29b5d]. The expectations of the failing tests were adapted later, which is why this was missed.

Lesson: Tcl_UtfNext() is _not_ just an optimized replacement for Tcl_UtfToUniChar(). Sorry, but this change is just too dangerous! Tcl_UniCharAtIndex() and Tcl_UtfAtIndex() most likely have the same regression when fed with invalid byte-sequences, therefore reverted those too. HOLD ON! These regressions are equally the result of [5c322bbd51]. It takes both changes to cause the failing tests. We need to argue about which change was the wrong one.
author: jan.nijtmans <nijtmans@users.sourceforge.net> 2020-04-23 19:07:08 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2020-04-23 19:07:08 (GMT)
commit: e7186db8a96017cbfe8baf62cb3a23ce279c1bb0 (patch)
tree: b0444a35fd26426f84187127afebdb19810af167
parent: 2ca7ab9af0d59c9907dde3d844e1785d33df4812 (diff)
download: tcl-e7186db8a96017cbfe8baf62cb3a23ce279c1bb0.zip
tcl-e7186db8a96017cbfe8baf62cb3a23ce279c1bb0.tar.gz
tcl-e7186db8a96017cbfe8baf62cb3a23ce279c1bb0.tar.bz2
2 files changed, 28 insertions, 69 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 0e11e0e..e095555 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -579,7 +579,7 @@ Tcl_NumUtfChars(
     int length)			/* The length of the string in bytes, or -1
 				 * for strlen(string). */
 {
-    const char *next;
+    Tcl_UniChar ch = 0;
     register int i = 0;
 
     /*
@@ -590,36 +590,22 @@ Tcl_NumUtfChars(
      */
 
     if (length < 0) {
-	while ((*src != '\0') && (i < INT_MAX)) {
-	    next = TclUtfNext(src);
-#if TCL_UTF_MAX > 4
+	while (*src != '\0') {
+	    src += TclUtfToUniChar(src, &ch);
 	    i++;
-#else
-	    i += 1 + ((next - src) > 3);
-#endif
-	    src = next;
 	}
+	if (i < 0) i = INT_MAX; /* Bug [2738427] */
     } else {
 	register const char *endPtr = src + length - TCL_UTF_MAX;
 
 	while (src < endPtr) {
-	    next = TclUtfNext(src);
-#if TCL_UTF_MAX > 4
+	    src += TclUtfToUniChar(src, &ch);
 	    i++;
-#else
-	    i += 1 + ((next - src) > 3);
-#endif
-	    src = next;
 	}
 	endPtr += TCL_UTF_MAX;
 	while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
-	    next = TclUtfNext(src);
-#if TCL_UTF_MAX > 4
+	    src += TclUtfToUniChar(src, &ch);
 	    i++;
-#else
-	    i += 1 + ((next - src) > 3);
-#endif
-	    src = next;
 	}
 	if (src < endPtr) {
 	    i += endPtr - src;
@@ -762,43 +748,15 @@ Tcl_UtfNext(
  *
  * Tcl_UtfPrev --
  *
- *	The aim of this routine is to provide a way to move backward
- *	through a UTF-8 string. The caller is expected to pass non-NULL
- *	pointer arguments start and src. start points to the beginning
- *	of a string, and src >= start points to a location within (or just
- *	past the end) of the string. This routine always returns a
- *	pointer within the string (>= start).  When (src == start), it
- *	returns start. When (src > start), it returns a pointer (< src)
- *	and (>= src - TCL_UTF_MAX).  Subject to these constraints, the
- *	routine returns a pointer to the earliest byte in the string that
- *	starts a character when characters are read starting at start and
- *	that character might include the byte src[-1]. The routine will
- *	examine only those bytes in the range that might be returned.
- *	It will not examine the byte *src, and because of that cannot
- *	determine for certain in all circumstances whether the character
- *	that begins with the returned pointer will or will not include
- *	the byte src[-1]. In the scenario, where src points to the end of
- *	a buffer being filled, the returned pointer points to either the
- *	final complete character in the string or to the earliest byte
- *	that might start an incomplete character waiting for more bytes to
- *	complete.
- *
- *	Because this routine always returns a value < src until the point
- *	it is forced to return start, it is useful as a backward iterator
- *	through a string that will always make progress and always be
- *	prevented from running past the beginning of the string.
- *
- *	In a string where all characters are complete and properly formed,
- *	and the value of src points to the first byte of a character,
- *	repeated Tcl_UtfPrev calls will step to the starting bytes of
- *	characters, one character at a time. Within those limitations,
- *	Tcl_UtfPrev and Tcl_UtfNext are inverses. If either condition cannot
- *	be met, Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and
- *	the caller will have to take greater care.
+ *	Given a pointer to some current location in a UTF-8 string, move
+ *	backwards one character. This works correctly when the pointer is in
+ *	the middle of a UTF-8 character.
  *
  * Results:
- *	A pointer to the start of a character in the string as described
- *	above.
+ *	The return value is a pointer to the previous character in the UTF-8
+ *	string. If the current location was already at the beginning of the
+ *	string, the return value will also be a pointer to the beginning of
+ *	the string.
  *
  * Side effects:
  *	None.
@@ -927,7 +885,9 @@ Tcl_UniCharAtIndex(
 {
     Tcl_UniChar ch = 0;
 
-    TclUtfToUniChar(Tcl_UtfAtIndex(src, index), &ch);
+    while (index-- >= 0) {
+	src += TclUtfToUniChar(src, &ch);
+    }
     return ch;
 }
 
@@ -953,20 +913,19 @@ Tcl_UtfAtIndex(
     register const char *src,	/* The UTF-8 string. */
     register int index)		/* The position of the desired character. */
 {
-    while (index-- > 0) {
-	const char *next = TclUtfNext(src);
-
-#if TCL_UTF_MAX <= 4
-	/*
-	 * 4-byte sequences generate two UCS-2 code units in the
-	 * UTF-16 representation, so in the current indexing scheme
-	 * we need to account for an extra index (total of two).
-	 */
-	index -= ((next - src) > 3);
-#endif
+    Tcl_UniChar ch = 0;
+    int len = 0;
 
-	src = next;
+    while (index-- > 0) {
+	len = TclUtfToUniChar(src, &ch);
+	src += len;
     }
+#if TCL_UTF_MAX == 4
+    if ((ch >= 0xD800) && (len < 3)) {
+	/* Index points at character following high Surrogate */
+	src += TclUtfToUniChar(src, &ch);
+    }
+#endif
     return src;
 }
 
diff --git a/tests/utf.test b/tests/utf.test
index cb650f4..3f20ace 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -147,7 +147,7 @@ test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 characater} {testnumutfc
 } 3
 test utf-4.12.0 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring ucs2} {
     testnumutfchars [testbytestring \xF0\x9F\x92\xA9] 4
-} 4
+} 2
 test utf-4.12.1 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring tip389} {
     testnumutfchars [testbytestring \xF0\x9F\x92\xA9] 4
 } 2
author	jan.nijtmans <nijtmans@users.sourceforge.net>	2020-04-23 19:07:08 (GMT)
committer	jan.nijtmans <nijtmans@users.sourceforge.net>	2020-04-23 19:07:08 (GMT)
commit	e7186db8a96017cbfe8baf62cb3a23ce279c1bb0 (patch)
tree	b0444a35fd26426f84187127afebdb19810af167
parent	2ca7ab9af0d59c9907dde3d844e1785d33df4812 (diff)
download	tcl-e7186db8a96017cbfe8baf62cb3a23ce279c1bb0.zip tcl-e7186db8a96017cbfe8baf62cb3a23ce279c1bb0.tar.gz tcl-e7186db8a96017cbfe8baf62cb3a23ce279c1bb0.tar.bz2