diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2020-04-14 10:17:31 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2020-04-14 10:17:31 (GMT) |
commit | e59db7e00e94f016d7c222aea7603dbbc8eecb4e (patch) | |
tree | 49eea3f1d82a1ac023889575a2e07d7643ad4b41 /generic/tclUtf.c | |
parent | 2f98c2ea4d9b29dc3a797522a457585ac5865388 (diff) | |
parent | 920063dce71227734c3cd38eea46fd644ec37ded (diff) | |
download | tcl-e59db7e00e94f016d7c222aea7603dbbc8eecb4e.zip tcl-e59db7e00e94f016d7c222aea7603dbbc8eecb4e.tar.gz tcl-e59db7e00e94f016d7c222aea7603dbbc8eecb4e.tar.bz2 |
Merge 8.6
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r-- | generic/tclUtf.c | 65 |
1 files changed, 53 insertions, 12 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 8d1371a..5908f36 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -64,6 +64,17 @@ static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 +}; + +static const unsigned char complete[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, @@ -697,7 +708,7 @@ Tcl_UtfCharComplete( * a complete UTF-8 character. */ int length) /* Length of above string in bytes. */ { - return length >= totalBytes[(unsigned char)*src]; + return length >= complete[(unsigned char)*src]; } /* @@ -875,15 +886,43 @@ Tcl_UtfNext( * * Tcl_UtfPrev -- * - * Given a pointer to some current location in a UTF-8 string, move - * backwards one character. This works correctly when the pointer is in - * the middle of a UTF-8 character. + * The aim of this routine is to provide a way to move backward + * through a UTF-8 string. The caller is expected to pass non-NULL + * pointer arguments start and src. start points to the beginning + * of a string, and src >= start points to a location within (or just + * past the end) of the string. 
This routine always returns a + * pointer within the string (>= start). When (src == start), it + * returns start. When (src > start), it returns a pointer (< src) + * and (>= src - TCL_UTF_MAX). Subject to these constraints, the + * routine returns a pointer to the earliest byte in the string that + * starts a character when characters are read starting at start and + * that character might include the byte src[-1]. The routine will + * examine only those bytes in the range that might be returned. + * It will not examine the byte *src, and because of that cannot + * determine for certain in all circumstances whether the character + * that begins with the returned pointer will or will not include + * the byte src[-1]. In the scenario where src points to the end of + * a buffer being filled, the returned pointer will point to either the + * final complete character in the string or to the earliest byte + * that might start an incomplete character waiting for more bytes to + * complete. + * + * Because this routine always returns a value < src until the point + * it is forced to return start, it is useful as a backward iterator + * through a string that will always make progress and always be + * prevented from running past the beginning of the string. + * + * In a string where all characters are complete and properly formed, + * and the value of src points to the first byte of a character, + * repeated Tcl_UtfPrev calls will step to the starting bytes of + * characters, one character at a time. Within those limitations, + * Tcl_UtfPrev and Tcl_UtfNext are inverses. If either condition cannot + * be met, Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and + * the caller will have to take greater care. * * Results: - * The return value is a pointer to the previous character in the UTF-8 - * string. If the current location was already at the beginning of the - * string, the return value will also be a pointer to the beginning of - * the string. 
+ * A pointer to the start of a character in the string as described + * above. * * Side effects: * None. @@ -893,9 +932,8 @@ Tcl_UtfNext( const char * Tcl_UtfPrev( - const char *src, /* The current location in the string. */ - const char *start) /* Pointer to the beginning of the string, to - * avoid going backwards too far. */ + const char *src, /* A location in a UTF-8 string. */ + const char *start) /* Pointer to the beginning of the string */ { const char *look; int i, byte; @@ -913,6 +951,9 @@ Tcl_UtfPrev( break; } if (byte >= 0xC0) { + if (totalBytes[byte] <= i) { + break; + } return look; } look--; @@ -1975,7 +2016,7 @@ Tcl_UniCharIsSpace( */ if (ch < 0x80) { - return TclIsSpaceProc((char) ch); + return TclIsSpaceProcM((char) ch); } else if (UNICODE_OUT_OF_RANGE(ch)) { return 0; } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B |