summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: dgp <dgp@users.sourceforge.net> 2020-04-24 20:51:14 (GMT)
committer: dgp <dgp@users.sourceforge.net> 2020-04-24 20:51:14 (GMT)
commit: e41fff474338362ada285b42e9da856ba6502903 (patch)
tree: 53f915447a76f507afe28c74a3fe9d78a2069c4a
parent: 9b3252ab93bb1eda4a7f82664832fb03a04b41b9 (diff)
parent: fbfa513c23b05ae5deeaa0ff81ce8045967890c0 (diff)
download: tcl-e41fff474338362ada285b42e9da856ba6502903.zip
tcl-e41fff474338362ada285b42e9da856ba6502903.tar.gz
tcl-e41fff474338362ada285b42e9da856ba6502903.tar.bz2
Merge 8.5. Failing tests need examination and adjustment.
-rw-r--r--  generic/tclCompExpr.c   5
-rw-r--r--  generic/tclUtf.c       38
-rw-r--r--  tests/utf.test          6
3 files changed, 16 insertions, 33 deletions
diff --git a/generic/tclCompExpr.c b/generic/tclCompExpr.c
index ed4e958..4390282 100644
--- a/generic/tclCompExpr.c
+++ b/generic/tclCompExpr.c
@@ -1885,6 +1885,7 @@ ParseLexeme(
{
const char *end;
int scanned;
+ Tcl_UniChar ch;
Tcl_Obj *literal = NULL;
unsigned char byte;
@@ -2063,13 +2064,13 @@ ParseLexeme(
if (!TclIsBareword(*start) || *start == '_') {
if (Tcl_UtfCharComplete(start, numBytes)) {
- scanned = TclUtfNext(start) - start;
+ scanned = Tcl_UtfToUniChar(start, &ch);
} else {
char utfBytes[TCL_UTF_MAX];
memcpy(utfBytes, start, (size_t) numBytes);
utfBytes[numBytes] = '\0';
- scanned = TclUtfNext(utfBytes) - utfBytes;
+ scanned = Tcl_UtfToUniChar(utfBytes, &ch);
}
*lexemePtr = INVALID;
Tcl_DecrRefCount(literal);
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 96953e2..80a5a83 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -579,7 +579,7 @@ Tcl_NumUtfChars(
int length) /* The length of the string in bytes, or -1
* for strlen(string). */
{
- const char *next;
+ Tcl_UniChar ch;
register int i = 0;
/*
@@ -591,35 +591,20 @@ Tcl_NumUtfChars(
if (length < 0) {
while ((*src != '\0') && (i < INT_MAX)) {
- next = TclUtfNext(src);
-#if TCL_UTF_MAX > 4
+ src += TclUtfToUniChar(src, &ch);
i++;
-#else
- i += 1 + ((next - src) > 3);
-#endif
- src = next;
}
} else {
register const char *endPtr = src + length - TCL_UTF_MAX;
while (src < endPtr) {
- next = TclUtfNext(src);
-#if TCL_UTF_MAX > 4
+ src += TclUtfToUniChar(src, &ch);
i++;
-#else
- i += 1 + ((next - src) > 3);
-#endif
- src = next;
}
endPtr += TCL_UTF_MAX;
while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
- next = TclUtfNext(src);
-#if TCL_UTF_MAX > 4
+ src += TclUtfToUniChar(src, &ch);
i++;
-#else
- i += 1 + ((next - src) > 3);
-#endif
- src = next;
}
if (src < endPtr) {
i += endPtr - src;
@@ -946,19 +931,10 @@ Tcl_UtfAtIndex(
register const char *src, /* The UTF-8 string. */
register int index) /* The position of the desired character. */
{
- while (index-- > 0) {
- const char *next = TclUtfNext(src);
+ Tcl_UniChar ch;
-#if TCL_UTF_MAX <= 4
- /*
- * 4-byte sequences generate two UCS-2 code units in the
- * UTF-16 representation, so in the current indexing scheme
- * we need to account for an extra index (total of two).
- */
- index -= ((next - src) > 3);
-#endif
-
- src = next;
+ while (index-- > 0) {
+ src += TclUtfToUniChar(src, &ch);
}
return src;
}
diff --git a/tests/utf.test b/tests/utf.test
index fc0766d..acdd50e 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -470,6 +470,12 @@ test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte valid sequence} testu
test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} {testutfnext ucs2} {
testutfnext -bytestring \x80\x80\x80
} 1
+test utf-6.125 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext {
+ testutfnext \xA0\xA0\xA0\xA0
+} 1
+test utf-6.126 {Tcl_UtfNext, pointing to 2th byte of 5-byte invalid sequence} testutfnext {
+ testutfnext \x80\x80\x80\x80
+} 1
test utf-7.1 {Tcl_UtfPrev} testutfprev {
testutfprev {}