summaryrefslogtreecommitdiffstats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
authorjan.nijtmans <nijtmans@users.sourceforge.net>2020-04-14 10:17:31 (GMT)
committerjan.nijtmans <nijtmans@users.sourceforge.net>2020-04-14 10:17:31 (GMT)
commite59db7e00e94f016d7c222aea7603dbbc8eecb4e (patch)
tree49eea3f1d82a1ac023889575a2e07d7643ad4b41 /generic/tclUtf.c
parent2f98c2ea4d9b29dc3a797522a457585ac5865388 (diff)
parent920063dce71227734c3cd38eea46fd644ec37ded (diff)
downloadtcl-e59db7e00e94f016d7c222aea7603dbbc8eecb4e.zip
tcl-e59db7e00e94f016d7c222aea7603dbbc8eecb4e.tar.gz
tcl-e59db7e00e94f016d7c222aea7603dbbc8eecb4e.tar.bz2
Merge 8.6
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r--generic/tclUtf.c65
1 files changed, 53 insertions, 12 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 8d1371a..5908f36 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -64,6 +64,17 @@ static const unsigned char totalBytes[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1
+};
+
+static const unsigned char complete[256] = {
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
/* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
@@ -697,7 +708,7 @@ Tcl_UtfCharComplete(
* a complete UTF-8 character. */
int length) /* Length of above string in bytes. */
{
- return length >= totalBytes[(unsigned char)*src];
+ return length >= complete[(unsigned char)*src];
}
/*
@@ -875,15 +886,43 @@ Tcl_UtfNext(
*
* Tcl_UtfPrev --
*
- * Given a pointer to some current location in a UTF-8 string, move
- * backwards one character. This works correctly when the pointer is in
- * the middle of a UTF-8 character.
+ * The aim of this routine is to provide a way to move backward
+ * through a UTF-8 string. The caller is expected to pass non-NULL
+ * pointer arguments start and src. start points to the beginning
+ * of a string, and src >= start points to a location within (or just
+ * past the end) of the string. This routine always returns a
+ * pointer within the string (>= start). When (src == start), it
+ * returns start. When (src > start), it returns a pointer (< src)
+ * and (>= src - TCL_UTF_MAX). Subject to these constraints, the
+ * routine returns a pointer to the earliest byte in the string that
+ * starts a character when characters are read starting at start and
+ * that character might include the byte src[-1]. The routine will
+ * examine only those bytes in the range that might be returned.
+ * It will not examine the byte *src, and because of that cannot
+ * determine for certain in all circumstances whether the character
+ * that begins with the returned pointer will or will not include
+ * the byte src[-1]. In the scenario, where src points to the end of
+ * a buffer being filled, the returned pointer point to either the
+ * final complete character in the string or to the earliest byte
+ * that might start an incomplete character waiting for more bytes to
+ * complete.
+ *
+ * Because this routine always returns a value < src until the point
+ * it is forced to return start, it is useful as a backward iterator
+ * through a string that will always make progress and always be
+ * prevented from running past the beginning of the string.
+ *
+ * In a string where all characters are complete and properly formed,
+ * and the value of src points to the first byte of a character,
+ * repeated Tcl_UtfPrev calls will step to the starting bytes of
+ * characters, one character at a time. Within those limitations,
+ * Tcl_UtfPrev and Tcl_UtfNext are inverses. If either condition cannot
+ * be met, Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and
+ * the caller will have to take greater care.
*
* Results:
- * The return value is a pointer to the previous character in the UTF-8
- * string. If the current location was already at the beginning of the
- * string, the return value will also be a pointer to the beginning of
- * the string.
+ * A pointer to the start of a character in the string as described
+ * above.
*
* Side effects:
* None.
@@ -893,9 +932,8 @@ Tcl_UtfNext(
const char *
Tcl_UtfPrev(
- const char *src, /* The current location in the string. */
- const char *start) /* Pointer to the beginning of the string, to
- * avoid going backwards too far. */
+ const char *src, /* A location in a UTF-8 string. */
+ const char *start) /* Pointer to the beginning of the string */
{
const char *look;
int i, byte;
@@ -913,6 +951,9 @@ Tcl_UtfPrev(
break;
}
if (byte >= 0xC0) {
+ if (totalBytes[byte] <= i) {
+ break;
+ }
return look;
}
look--;
@@ -1975,7 +2016,7 @@ Tcl_UniCharIsSpace(
*/
if (ch < 0x80) {
- return TclIsSpaceProc((char) ch);
+ return TclIsSpaceProcM((char) ch);
} else if (UNICODE_OUT_OF_RANGE(ch)) {
return 0;
} else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B