summaryrefslogtreecommitdiffstats
path: root/tcl8.6/generic/tclUtf.c
diff options
context:
space:
mode:
authorWilliam Joye <wjoye@cfa.harvard.edu>2018-12-19 21:30:31 (GMT)
committerWilliam Joye <wjoye@cfa.harvard.edu>2018-12-19 21:30:31 (GMT)
commit21ae37c5a9db158c6c4d54a0fa75b8e0b526ebed (patch)
tree312b750afe3bdbe92b5727e2e552da8ed1751b7f /tcl8.6/generic/tclUtf.c
parent50f1824860159ebdb3896e14647a4562edf658d2 (diff)
parent0587fffc5409653d44546dbcebc7a497954cbb46 (diff)
downloadblt-21ae37c5a9db158c6c4d54a0fa75b8e0b526ebed.zip
blt-21ae37c5a9db158c6c4d54a0fa75b8e0b526ebed.tar.gz
blt-21ae37c5a9db158c6c4d54a0fa75b8e0b526ebed.tar.bz2
Merge branch 'devel'
Diffstat (limited to 'tcl8.6/generic/tclUtf.c')
-rw-r--r--tcl8.6/generic/tclUtf.c126
1 files changed, 83 insertions, 43 deletions
diff --git a/tcl8.6/generic/tclUtf.c b/tcl8.6/generic/tclUtf.c
index c60e99e..b33bf5f 100644
--- a/tcl8.6/generic/tclUtf.c
+++ b/tcl8.6/generic/tclUtf.c
@@ -154,19 +154,26 @@ Tcl_UniCharToUtf(
return 2;
}
if (ch <= 0xFFFF) {
-#if TCL_UTF_MAX == 4
+#if TCL_UTF_MAX > 3
if ((ch & 0xF800) == 0xD800) {
if (ch & 0x0400) {
/* Low surrogate */
- buf[3] = (char) ((ch | 0x80) & 0xBF);
- buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F);
- return 4;
+ if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80)
+ && ((buf[2] & 0xCF) == 0)) {
+ /* Previous Tcl_UniChar was a High surrogate, so combine */
+ buf[3] = (char) ((ch & 0x3F) | 0x80);
+ buf[2] |= (char) (((ch >> 6) & 0x0F) | 0x80);
+ return 4;
+ }
+ /* Previous Tcl_UniChar was not a High surrogate, so just output */
} else {
/* High surrogate */
ch += 0x40;
- buf[2] = (char) (((ch << 4) | 0x80) & 0xB0);
- buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF);
- buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7);
+ /* Fill buffer with specific 3-byte (invalid) byte combination,
+ so following Low surrogate can recognize it and combine */
+ buf[2] = (char) ((ch << 4) & 0x30);
+ buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80);
+ buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0);
return 0;
}
}
@@ -182,6 +189,13 @@ Tcl_UniCharToUtf(
buf[0] = (char) ((ch >> 18) | 0xF0);
return 4;
}
+ } else if (ch == -1) {
+ if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80)
+ && ((buf[2] & 0xCF) == 0)) {
+ ch = 0xD7C0 + ((buf[0] & 0x07) << 8) + ((buf[1] & 0x3F) << 2)
+ + ((buf[2] & 0x30) >> 4);
+ goto three;
+ }
#endif
}
@@ -229,7 +243,7 @@ Tcl_UniCharToUtfDString(
*/
oldLength = Tcl_DStringLength(dsPtr);
- Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX);
+ Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * TCL_UTF_MAX);
string = Tcl_DStringValue(dsPtr) + oldLength;
p = string;
@@ -418,17 +432,27 @@ Tcl_UtfToUniCharDString(
*/
oldLength = Tcl_DStringLength(dsPtr);
-/* TODO: fix overreach! */
+
Tcl_DStringSetLength(dsPtr,
- (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
+ oldLength + (int) ((length + 1) * sizeof(Tcl_UniChar)));
wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
w = wString;
- end = src + length;
- for (p = src; p < end; ) {
+ p = src;
+ end = src + length - TCL_UTF_MAX;
+ while (p < end) {
p += TclUtfToUniChar(p, &ch);
*w++ = ch;
}
+ end += TCL_UTF_MAX;
+ while (p < end) {
+ if (Tcl_UtfCharComplete(p, end-p)) {
+ p += TclUtfToUniChar(p, &ch);
+ } else {
+ ch = UCHAR(*p++);
+ }
+ *w++ = ch;
+ }
*w = '\0';
Tcl_DStringSetLength(dsPtr,
(oldLength + ((char *) w - (char *) wString)));
@@ -726,8 +750,7 @@ Tcl_UniCharAtIndex(
{
Tcl_UniChar ch = 0;
- while (index >= 0) {
- index--;
+ while (index-- >= 0) {
src += TclUtfToUniChar(src, &ch);
}
return ch;
@@ -756,11 +779,18 @@ Tcl_UtfAtIndex(
register int index) /* The position of the desired character. */
{
Tcl_UniChar ch = 0;
+ int len = 1;
- while (index > 0) {
- index--;
+ while (index-- > 0) {
+ len = TclUtfToUniChar(src, &ch);
+ src += len;
+ }
+#if TCL_UTF_MAX == 4
+ if (!len) {
+ /* Index points at character following High Surrogate */
src += TclUtfToUniChar(src, &ch);
}
+#endif
return src;
}
@@ -971,7 +1001,11 @@ Tcl_UtfToTitle(
}
while (*src) {
bytes = TclUtfToUniChar(src, &ch);
- lowChar = Tcl_UniCharToLower(ch);
+ lowChar = ch;
+ /* Special exception for Georgian Asomtavruli chars, no titlecase. */
+ if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
+ lowChar = Tcl_UniCharToLower(lowChar);
+ }
if (bytes < UtfCount(lowChar)) {
memcpy(dst, src, (size_t) bytes);
@@ -1072,16 +1106,17 @@ Tcl_UtfNcmp(
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
+ if (ch1 != ch2) {
#if TCL_UTF_MAX == 4
- /* map high surrogate characters to values > 0xffff */
- if ((ch1 & 0xFC00) == 0xD800) {
- ch1 += 0x4000;
- }
- if ((ch2 & 0xFC00) == 0xD800) {
- ch2 += 0x4000;
- }
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
#endif
- if (ch1 != ch2) {
return (ch1 - ch2);
}
}
@@ -1122,16 +1157,17 @@ Tcl_UtfNcasecmp(
*/
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
+ if (ch1 != ch2) {
#if TCL_UTF_MAX == 4
- /* map high surrogate characters to values > 0xffff */
- if ((ch1 & 0xFC00) == 0xD800) {
- ch1 += 0x4000;
- }
- if ((ch2 & 0xFC00) == 0xD800) {
- ch2 += 0x4000;
- }
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
#endif
- if (ch1 != ch2) {
ch1 = Tcl_UniCharToLower(ch1);
ch2 = Tcl_UniCharToLower(ch2);
if (ch1 != ch2) {
@@ -1170,16 +1206,17 @@ TclUtfCasecmp(
while (*cs && *ct) {
cs += TclUtfToUniChar(cs, &ch1);
ct += TclUtfToUniChar(ct, &ch2);
+ if (ch1 != ch2) {
#if TCL_UTF_MAX == 4
- /* map high surrogate characters to values > 0xffff */
- if ((ch1 & 0xFC00) == 0xD800) {
- ch1 += 0x4000;
- }
- if ((ch2 & 0xFC00) == 0xD800) {
- ch2 += 0x4000;
- }
+ /* Surrogates always report higher than non-surrogates */
+ if (((ch1 & 0xFC00) == 0xD800)) {
+ if ((ch2 & 0xFC00) != 0xD800) {
+ return ch1;
+ }
+ } else if ((ch2 & 0xFC00) == 0xD800) {
+ return -ch2;
+ }
#endif
- if (ch1 != ch2) {
ch1 = Tcl_UniCharToLower(ch1);
ch2 = Tcl_UniCharToLower(ch2);
if (ch1 != ch2) {
@@ -1240,8 +1277,9 @@ Tcl_UniCharToLower(
int ch) /* Unicode character to convert. */
{
int info = GetUniCharInfo(ch);
+ int mode = GetCaseType(info);
- if (GetCaseType(info) & 0x02) {
+ if ((mode & 0x02) && (mode != 0x7)) {
ch += GetDelta(info);
}
return (Tcl_UniChar) ch;
@@ -1275,7 +1313,9 @@ Tcl_UniCharToTitle(
* Subtract or add one depending on the original case.
*/
- ch += ((mode & 0x4) ? -1 : 1);
+ if (mode != 0x7) {
+ ch += ((mode & 0x4) ? -1 : 1);
+ }
} else if (mode == 0x4) {
ch -= GetDelta(info);
}