summary | refs | log | tree | commit | diff | stats
path: root/generic
diff options
context:
space:
mode:
author: jan.nijtmans <nijtmans@users.sourceforge.net> 2020-05-19 20:10:17 (GMT)
committer: jan.nijtmans <nijtmans@users.sourceforge.net> 2020-05-19 20:10:17 (GMT)
commit: 5f29a0196dbc94ae23df0ff6d5d9b5d1ffbd7d7f (patch)
tree: cd7bc07d09d710f57728a02c909f22f8f9a87c96 /generic
parent: e2d53f617b2bc55da830e4b7ba566d920873e83e (diff)
download: tk-5f29a0196dbc94ae23df0ff6d5d9b5d1ffbd7d7f.zip
          tk-5f29a0196dbc94ae23df0ff6d5d9b5d1ffbd7d7f.tar.gz
          tk-5f29a0196dbc94ae23df0ff6d5d9b5d1ffbd7d7f.tar.bz2
Little variation on bug-a179564826, in which Character indexing is kept, but with surrogate protection
Diffstat (limited to 'generic')
-rw-r--r--  generic/tkInt.h   3
-rw-r--r--  generic/tkUtil.c  71
2 files changed, 6 insertions, 68 deletions
diff --git a/generic/tkInt.h b/generic/tkInt.h
index a6304f8..c27bede 100644
--- a/generic/tkInt.h
+++ b/generic/tkInt.h
@@ -1287,19 +1287,18 @@ MODULE_SCOPE void TkUnixSetXftClipRegion(TkRegion clipRegion);
# define c_class class
#endif
+#define TkNumUtfChars Tcl_NumUtfChars
#if TCL_UTF_MAX > 4
# define TkUtfToUniChar Tcl_UtfToUniChar
# define TkUniCharToUtf Tcl_UniCharToUtf
# define TkUtfPrev Tcl_UtfPrev
# define TkUtfAtIndex Tcl_UtfAtIndex
-# define TkNumUtfChars Tcl_NumUtfChars
# define TkUtfCharComplete Tcl_UtfCharComplete
#else
MODULE_SCOPE int TkUtfToUniChar(const char *, int *);
MODULE_SCOPE int TkUniCharToUtf(int, char *);
MODULE_SCOPE const char *TkUtfPrev(const char *, const char *);
MODULE_SCOPE const char *TkUtfAtIndex(const char *src, int index);
- MODULE_SCOPE int TkNumUtfChars(const char *src, int length);
# define TkUtfCharComplete(src, length) (((unsigned)(UCHAR(*(src)) - 0xF0) < 5) \
? ((length) >= 4) : (UCHAR(*(src)) == 0xED) ? ((length) >= 6) : Tcl_UtfCharComplete((src), (length)))
#endif
diff --git a/generic/tkUtil.c b/generic/tkUtil.c
index e055b0d..172bf23 100644
--- a/generic/tkUtil.c
+++ b/generic/tkUtil.c
@@ -1308,8 +1308,7 @@ TkUtfPrev(
* TkUtfAtIndex --
*
* Returns a pointer to the specified character (not byte) position in
- * a CESU-8 string. That is, a pair of CESU-8 encoded surrogates counts
- * as a single character.
+ * a CESU-8 string. This will never point at a low surrogate.
*
* Results:
* As above.
@@ -1325,72 +1324,12 @@ TkUtfAtIndex(
const char *src, /* The UTF-8 string. */
int index) /* The position of the desired character. */
{
- int len = 0;
int ch;
-
- while (index-- > 0) {
- len = TkUtfToUniChar(src, &ch);
- src += len;
+ const char *p = Tcl_UtfAtIndex(src, index);
+ if ((p > src) && (UCHAR(p[-1]) > 0xF0)) {
+ return p + TkUtfToUniChar(p - 1, &ch);
}
- return src;
-}
-
-/*
- *---------------------------------------------------------------------------
- *
- * TkNumUtfChars --
- *
- * Returns the number of characters (not bytes) in the UTF-8 string, not
- * including the terminating NULL byte. This differs from Tcl_NumUtfChars
- * in that a pair of CESU-8 encoded surrogates counts as one unicode
- * character.
- *
- * Results:
- * As above.
- *
- * Side effects:
- * None.
- *
- *---------------------------------------------------------------------------
- */
-
-int
-TkNumUtfChars(
- const char *src, /* The UTF-8 string to measure. */
- int length) /* The length of the string in bytes, or -1
- * for strlen(string). */
-{
- int ch;
- int i = 0;
- Tcl_UniChar ch2 = 0;
-
- if (length < 0) {
- /* string is NUL-terminated, so TclUtfToUniChar calls are safe. */
- while ((*src != '\0') && (i < INT_MAX)) {
- src += TkUtfToUniChar(src, &ch);
- i++;
- }
- } else {
- /* No need to call TkUtfCharComplete() up to endPtr */
- const char *endPtr = src + length - 6;
- while (src < endPtr) {
- src += TkUtfToUniChar(src, &ch);
- i++;
- }
- /* Pointer to the end of string. Never read endPtr[0] */
- endPtr += 6;
- while (src < endPtr) {
- if (TkUtfCharComplete(src, endPtr - src)) {
- src += TkUtfToUniChar(src, &ch);
- } else if (Tcl_UtfCharComplete(src, endPtr - src)) {
- src += Tcl_UtfToUniChar(src, &ch2);
- } else {
- src++;
- }
- i++;
- }
- }
- return i;
+ return p;
}
#endif