Merge 8.6

author: jan.nijtmans <nijtmans@users.sourceforge.net> 2020-04-14 10:17:31 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2020-04-14 10:17:31 (GMT)
commit: e59db7e00e94f016d7c222aea7603dbbc8eecb4e (patch)
tree: 49eea3f1d82a1ac023889575a2e07d7643ad4b41 /generic/tclUtf.c
parent: 2f98c2ea4d9b29dc3a797522a457585ac5865388 (diff)
parent: 920063dce71227734c3cd38eea46fd644ec37ded (diff)
download: tcl-e59db7e00e94f016d7c222aea7603dbbc8eecb4e.zip
tcl-e59db7e00e94f016d7c222aea7603dbbc8eecb4e.tar.gz
tcl-e59db7e00e94f016d7c222aea7603dbbc8eecb4e.tar.bz2
1 files changed, 53 insertions, 12 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 8d1371a..5908f36 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -64,6 +64,17 @@ static const unsigned char totalBytes[256] = {
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1
+};
+
+static const unsigned char complete[256] = {
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 /* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */
     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
@@ -697,7 +708,7 @@ Tcl_UtfCharComplete(
 				 * a complete UTF-8 character. */
     int length)			/* Length of above string in bytes. */
 {
-    return length >= totalBytes[(unsigned char)*src];
+    return length >= complete[(unsigned char)*src];
 }
 
 /*
@@ -875,15 +886,43 @@ Tcl_UtfNext(
  *
  * Tcl_UtfPrev --
  *
- *	Given a pointer to some current location in a UTF-8 string, move
- *	backwards one character. This works correctly when the pointer is in
- *	the middle of a UTF-8 character.
+ *	The aim of this routine is to provide a way to move backward
+ *	through a UTF-8 string. The caller is expected to pass non-NULL
+ *	pointer arguments start and src. start points to the beginning
+ *	of a string, and src >= start points to a location within (or just
+ *	past the end of) the string. This routine always returns a
+ *	pointer within the string (>= start).  When (src == start), it
+ *	returns start. When (src > start), it returns a pointer (< src)
+ *	and (>= src - TCL_UTF_MAX).  Subject to these constraints, the
+ *	routine returns a pointer to the earliest byte in the string that
+ *	starts a character when characters are read starting at start and
+ *	that character might include the byte src[-1]. The routine will
+ *	examine only those bytes in the range that might be returned.
+ *	It will not examine the byte *src, and because of that cannot 
+ *	determine for certain in all circumstances whether the character
+ *	that begins with the returned pointer will or will not include
+ *	the byte src[-1]. In the scenario where src points to the end of
+ *	a buffer being filled, the returned pointer points to either the
+ *	final complete character in the string or to the earliest byte
+ *	that might start an incomplete character waiting for more bytes to
+ *	complete.
+ *
+ *	Because this routine always returns a value < src until the point
+ *	it is forced to return start, it is useful as a backward iterator
+ *	through a string that will always make progress and always be
+ *	prevented from running past the beginning of the string.
+ *
+ *	In a string where all characters are complete and properly formed,
+ *	and the value of src points to the first byte of a character, 
+ *	repeated Tcl_UtfPrev calls will step to the starting bytes of
+ *	characters, one character at a time. Within those limitations,
+ *	Tcl_UtfPrev and Tcl_UtfNext are inverses. If either condition cannot
+ *	be met, Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and
+ *	the caller will have to take greater care.
  *
  * Results:
- *	The return value is a pointer to the previous character in the UTF-8
- *	string. If the current location was already at the beginning of the
- *	string, the return value will also be a pointer to the beginning of
- *	the string.
+ *	A pointer to the start of a character in the string as described
+ *	above.
  *
  * Side effects:
  *	None.
@@ -893,9 +932,8 @@ Tcl_UtfNext(
 
 const char *
 Tcl_UtfPrev(
-    const char *src,		/* The current location in the string. */
-    const char *start)		/* Pointer to the beginning of the string, to
-				 * avoid going backwards too far. */
+    const char *src,		/* A location in a UTF-8 string. */
+    const char *start)		/* Pointer to the beginning of the string */
 {
     const char *look;
     int i, byte;
@@ -913,6 +951,9 @@ Tcl_UtfPrev(
 	    break;
 	}
 	if (byte >= 0xC0) {
+	    if (totalBytes[byte] <= i) {
+		break;
+	    }
 	    return look;
 	}
 	look--;
@@ -1975,7 +2016,7 @@ Tcl_UniCharIsSpace(
      */
 
     if (ch < 0x80) {
-	return TclIsSpaceProc((char) ch);
+	return TclIsSpaceProcM((char) ch);
     } else if (UNICODE_OUT_OF_RANGE(ch)) {
 	return 0;
     } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B
author	jan.nijtmans <nijtmans@users.sourceforge.net>	2020-04-14 10:17:31 (GMT)
committer	jan.nijtmans <nijtmans@users.sourceforge.net>	2020-04-14 10:17:31 (GMT)
commit	e59db7e00e94f016d7c222aea7603dbbc8eecb4e (patch)
tree	49eea3f1d82a1ac023889575a2e07d7643ad4b41 /generic/tclUtf.c
parent	2f98c2ea4d9b29dc3a797522a457585ac5865388 (diff)
parent	920063dce71227734c3cd38eea46fd644ec37ded (diff)
download	tcl-e59db7e00e94f016d7c222aea7603dbbc8eecb4e.zip tcl-e59db7e00e94f016d7c222aea7603dbbc8eecb4e.tar.gz tcl-e59db7e00e94f016d7c222aea7603dbbc8eecb4e.tar.bz2