diff options
author | jan.nijtmans <nijtmans@users.sourceforge.net> | 2020-05-19 20:10:17 (GMT) |
---|---|---|
committer | jan.nijtmans <nijtmans@users.sourceforge.net> | 2020-05-19 20:10:17 (GMT) |
commit | 5f29a0196dbc94ae23df0ff6d5d9b5d1ffbd7d7f (patch) | |
tree | cd7bc07d09d710f57728a02c909f22f8f9a87c96 /generic | |
parent | e2d53f617b2bc55da830e4b7ba566d920873e83e (diff) | |
download | tk-5f29a0196dbc94ae23df0ff6d5d9b5d1ffbd7d7f.zip tk-5f29a0196dbc94ae23df0ff6d5d9b5d1ffbd7d7f.tar.gz tk-5f29a0196dbc94ae23df0ff6d5d9b5d1ffbd7d7f.tar.bz2 |
A small variation on bug-a179564826, in which character indexing is kept, but with surrogate protection
Diffstat (limited to 'generic')
-rw-r--r-- | generic/tkInt.h | 3 | ||||
-rw-r--r-- | generic/tkUtil.c | 71 |
2 files changed, 6 insertions, 68 deletions
diff --git a/generic/tkInt.h b/generic/tkInt.h index a6304f8..c27bede 100644 --- a/generic/tkInt.h +++ b/generic/tkInt.h @@ -1287,19 +1287,18 @@ MODULE_SCOPE void TkUnixSetXftClipRegion(TkRegion clipRegion); # define c_class class #endif +#define TkNumUtfChars Tcl_NumUtfChars #if TCL_UTF_MAX > 4 # define TkUtfToUniChar Tcl_UtfToUniChar # define TkUniCharToUtf Tcl_UniCharToUtf # define TkUtfPrev Tcl_UtfPrev # define TkUtfAtIndex Tcl_UtfAtIndex -# define TkNumUtfChars Tcl_NumUtfChars # define TkUtfCharComplete Tcl_UtfCharComplete #else MODULE_SCOPE int TkUtfToUniChar(const char *, int *); MODULE_SCOPE int TkUniCharToUtf(int, char *); MODULE_SCOPE const char *TkUtfPrev(const char *, const char *); MODULE_SCOPE const char *TkUtfAtIndex(const char *src, int index); - MODULE_SCOPE int TkNumUtfChars(const char *src, int length); # define TkUtfCharComplete(src, length) (((unsigned)(UCHAR(*(src)) - 0xF0) < 5) \ ? ((length) >= 4) : (UCHAR(*(src)) == 0xED) ? ((length) >= 6) : Tcl_UtfCharComplete((src), (length))) #endif diff --git a/generic/tkUtil.c b/generic/tkUtil.c index e055b0d..172bf23 100644 --- a/generic/tkUtil.c +++ b/generic/tkUtil.c @@ -1308,8 +1308,7 @@ TkUtfPrev( * TkUtfAtIndex -- * * Returns a pointer to the specified character (not byte) position in - * a CESU-8 string. That is, a pair of CESU-8 encoded surrogates counts - * as a single character. + * a CESU-8 string. This will never point at a low surrogate. * * Results: * As above. @@ -1325,72 +1324,12 @@ TkUtfAtIndex( const char *src, /* The UTF-8 string. */ int index) /* The position of the desired character. 
*/ { - int len = 0; int ch; - - while (index-- > 0) { - len = TkUtfToUniChar(src, &ch); - src += len; + const char *p = Tcl_UtfAtIndex(src, index); + if ((p > src) && (UCHAR(p[-1]) > 0xF0)) { + return p + TkUtfToUniChar(p - 1, &ch); } - return src; -} - -/* - *--------------------------------------------------------------------------- - * - * TkNumUtfChars -- - * - * Returns the number of characters (not bytes) in the UTF-8 string, not - * including the terminating NULL byte. This differs from Tcl_NumUtfChars - * in that a pair of CESU-8 encoded surrogates counts as one unicode - * character. - * - * Results: - * As above. - * - * Side effects: - * None. - * - *--------------------------------------------------------------------------- - */ - -int -TkNumUtfChars( - const char *src, /* The UTF-8 string to measure. */ - int length) /* The length of the string in bytes, or -1 - * for strlen(string). */ -{ - int ch; - int i = 0; - Tcl_UniChar ch2 = 0; - - if (length < 0) { - /* string is NUL-terminated, so TclUtfToUniChar calls are safe. */ - while ((*src != '\0') && (i < INT_MAX)) { - src += TkUtfToUniChar(src, &ch); - i++; - } - } else { - /* No need to call TkUtfCharComplete() up to endPtr */ - const char *endPtr = src + length - 6; - while (src < endPtr) { - src += TkUtfToUniChar(src, &ch); - i++; - } - /* Pointer to the end of string. Never read endPtr[0] */ - endPtr += 6; - while (src < endPtr) { - if (TkUtfCharComplete(src, endPtr - src)) { - src += TkUtfToUniChar(src, &ch); - } else if (Tcl_UtfCharComplete(src, endPtr - src)) { - src += Tcl_UtfToUniChar(src, &ch2); - } else { - src++; - } - i++; - } - } - return i; + return p; } #endif |