summaryrefslogtreecommitdiffstats
path: root/generic/tclUtf.c
diff options
context:
space:
mode:
authorkjnash <k.j.nash@usa.net>2022-08-31 15:24:20 (GMT)
committerkjnash <k.j.nash@usa.net>2022-08-31 15:24:20 (GMT)
commit7443a97bd1d5060c2bc3ea57dbd1899ea2efb9b8 (patch)
treea7402019faf3e75458552fe9dde90324f981fe7b /generic/tclUtf.c
parent19f8c3bb6b2aa8d571a7534b588ddacfb49952d3 (diff)
parent52b58d0c7d1575d7c784ccb344862e0de8a9686b (diff)
downloadtcl-7443a97bd1d5060c2bc3ea57dbd1899ea2efb9b8.zip
tcl-7443a97bd1d5060c2bc3ea57dbd1899ea2efb9b8.tar.gz
tcl-7443a97bd1d5060c2bc3ea57dbd1899ea2efb9b8.tar.bz2
Merge old 8.7 6c69a72c58
Diffstat (limited to 'generic/tclUtf.c')
-rw-r--r--generic/tclUtf.c71
1 files changed, 36 insertions, 35 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 525cd50..28f725a 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -3,7 +3,7 @@
*
* Routines for manipulating UTF-8 strings.
*
- * Copyright (c) 1997-1998 Sun Microsystems, Inc.
+ * Copyright © 1997-1998 Sun Microsystems, Inc.
*
* See the file "license.terms" for information on usage and redistribution of
* this file, and for a DISCLAIMER OF ALL WARRANTIES.
@@ -64,20 +64,12 @@ static const unsigned char totalBytes[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-/* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-/* End of "continuation byte section" */
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-#if TCL_UTF_MAX > 3
- 4,4,4,4,4,
-#else
- 1,1,1,1,1,
-#endif
- 1,1,1,1,1,1,1,1,1,1,1
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1
};
-
+
static const unsigned char complete[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -88,15 +80,9 @@ static const unsigned char complete[256] = {
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/* End of "continuation byte section" */
2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-#if TCL_UTF_MAX > 3
- 4,4,4,4,4,
-#else
- 3,3,3,3,3,
-#endif
- 1,1,1,1,1,1,1,1,1,1,1
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1
};
-
+
/*
* Functions used only in this module.
*/
@@ -694,7 +680,7 @@ Tcl_UtfToUniCharDString(
p += TclUtfToUCS4(p, &ch);
*w++ = ch;
}
- while ((p < endPtr) && TclUCS4Complete(p, endPtr-p)) {
+ while ((p < endPtr) && Tcl_UtfCharComplete(p, endPtr-p)) {
p += TclUtfToUCS4(p, &ch);
*w++ = ch;
}
@@ -752,7 +738,7 @@ Tcl_UtfToChar16DString(
*w++ = ch;
}
while (p < endPtr) {
- if (TclChar16Complete(p, endPtr-p)) {
+ if (Tcl_UtfCharComplete(p, endPtr-p)) {
p += Tcl_UtfToChar16(p, &ch);
*w++ = ch;
} else {
@@ -833,7 +819,7 @@ Tcl_NumUtfChars(
/* Pointer to the end of string. Never read endPtr[0] */
const char *endPtr = src + length;
/* Pointer to last byte where optimization still can be used */
- const char *optPtr = endPtr - TCL_UTF_MAX;
+ const char *optPtr = endPtr - 4;
/*
* Optimize away the call in this loop. Justified because...
@@ -970,6 +956,10 @@ Tcl_UtfNext(
const char *next;
if (((*src) & 0xC0) == 0x80) {
+ /* Continuation byte, so we start 'inside' a (possible valid) UTF-8
+ * sequence. Since we are not allowed to access src[-1], we cannot
+ * check if the sequence is actually valid, the best we can do is
+ * just assume it is valid and locate the end. */
if ((((*++src) & 0xC0) == 0x80) && (((*++src) & 0xC0) == 0x80)) {
++src;
}
@@ -1064,7 +1054,7 @@ Tcl_UtfPrev(
* it (the fallback) is correct.
*/
- || (trailBytesSeen >= complete[byte])) {
+ || (trailBytesSeen >= totalBytes[byte])) {
/*
* That is, (1 + trailBytesSeen > needed).
* We've examined more bytes than needed to complete
@@ -1105,19 +1095,14 @@ Tcl_UtfPrev(
/* Continue the search backwards... */
look--;
- } while (trailBytesSeen < TCL_UTF_MAX);
+ } while (trailBytesSeen < 4);
/*
- * We've seen TCL_UTF_MAX trail bytes, so we know there will not be a
+ * We've seen 4 trail bytes, so we know there will not be a
* properly formed byte sequence to find, and we can stop looking,
- * accepting the fallback (for TCL_UTF_MAX > 3) or just go back as
- * far as we can.
+ * accepting the fallback.
*/
-#if TCL_UTF_MAX > 3
return fallback;
-#else
- return src - TCL_UTF_MAX;
-#endif
}
/*
@@ -1744,7 +1729,7 @@ Tcl_UniCharToLower(
/* Clear away extension bits, if any */
return ch & 0x1FFFFF;
}
-
+
/*
*----------------------------------------------------------------------
*
@@ -1838,7 +1823,7 @@ Tcl_UniCharNcmp(
const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */
unsigned long numChars) /* Number of unichars to compare. */
{
-#ifdef WORDS_BIGENDIAN
+#if defined(WORDS_BIGENDIAN) && (TCL_UTF_MAX > 3)
/*
* We are definitely on a big-endian machine; memcmp() is safe
*/
@@ -1852,6 +1837,14 @@ Tcl_UniCharNcmp(
for ( ; numChars != 0; ucs++, uct++, numChars--) {
if (*ucs != *uct) {
+#if TCL_UTF_MAX < 4
+ /* special case for handling upper surrogates */
+ if (((*ucs & 0xFC00) == 0xD800) && ((*uct & 0xFC00) != 0xD800)) {
+ return 1;
+ } else if (((*uct & 0xFC00) == 0xD800)) {
+ return -1;
+ }
+#endif
return (*ucs - *uct);
}
}
@@ -1889,6 +1882,14 @@ Tcl_UniCharNcasecmp(
Tcl_UniChar lct = Tcl_UniCharToLower(*uct);
if (lcs != lct) {
+#if TCL_UTF_MAX < 4
+ /* special case for handling upper surrogates */
+ if (((lcs & 0xFC00) == 0xD800) && ((lct & 0xFC00) != 0xD800)) {
+ return 1;
+ } else if (((lct & 0xFC00) == 0xD800)) {
+ return -1;
+ }
+#endif
return (lcs - lct);
}
}