diff options
author | William Joye <wjoye@cfa.harvard.edu> | 2018-12-19 21:30:31 (GMT) |
---|---|---|
committer | William Joye <wjoye@cfa.harvard.edu> | 2018-12-19 21:30:31 (GMT) |
commit | 21ae37c5a9db158c6c4d54a0fa75b8e0b526ebed (patch) | |
tree | 312b750afe3bdbe92b5727e2e552da8ed1751b7f /tcl8.6/generic/tclUtf.c | |
parent | 50f1824860159ebdb3896e14647a4562edf658d2 (diff) | |
parent | 0587fffc5409653d44546dbcebc7a497954cbb46 (diff) | |
download | blt-21ae37c5a9db158c6c4d54a0fa75b8e0b526ebed.zip blt-21ae37c5a9db158c6c4d54a0fa75b8e0b526ebed.tar.gz blt-21ae37c5a9db158c6c4d54a0fa75b8e0b526ebed.tar.bz2 |
Merge branch 'devel'
Diffstat (limited to 'tcl8.6/generic/tclUtf.c')
-rw-r--r-- | tcl8.6/generic/tclUtf.c | 126 |
1 files changed, 83 insertions, 43 deletions
diff --git a/tcl8.6/generic/tclUtf.c b/tcl8.6/generic/tclUtf.c index c60e99e..b33bf5f 100644 --- a/tcl8.6/generic/tclUtf.c +++ b/tcl8.6/generic/tclUtf.c @@ -154,19 +154,26 @@ Tcl_UniCharToUtf( return 2; } if (ch <= 0xFFFF) { -#if TCL_UTF_MAX == 4 +#if TCL_UTF_MAX > 3 if ((ch & 0xF800) == 0xD800) { if (ch & 0x0400) { /* Low surrogate */ - buf[3] = (char) ((ch | 0x80) & 0xBF); - buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F); - return 4; + if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80) + && ((buf[2] & 0xCF) == 0)) { + /* Previous Tcl_UniChar was a High surrogate, so combine */ + buf[3] = (char) ((ch & 0x3F) | 0x80); + buf[2] |= (char) (((ch >> 6) & 0x0F) | 0x80); + return 4; + } + /* Previous Tcl_UniChar was not a High surrogate, so just output */ } else { /* High surrogate */ ch += 0x40; - buf[2] = (char) (((ch << 4) | 0x80) & 0xB0); - buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF); - buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7); + /* Fill buffer with specific 3-byte (invalid) byte combination, + so following Low surrogate can recognize it and combine */ + buf[2] = (char) ((ch << 4) & 0x30); + buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80); + buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0); return 0; } } @@ -182,6 +189,13 @@ Tcl_UniCharToUtf( buf[0] = (char) ((ch >> 18) | 0xF0); return 4; } + } else if (ch == -1) { + if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80) + && ((buf[2] & 0xCF) == 0)) { + ch = 0xD7C0 + ((buf[0] & 0x07) << 8) + ((buf[1] & 0x3F) << 2) + + ((buf[2] & 0x30) >> 4); + goto three; + } #endif } @@ -229,7 +243,7 @@ Tcl_UniCharToUtfDString( */ oldLength = Tcl_DStringLength(dsPtr); - Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX); + Tcl_DStringSetLength(dsPtr, oldLength + (uniLength + 1) * TCL_UTF_MAX); string = Tcl_DStringValue(dsPtr) + oldLength; p = string; @@ -418,17 +432,27 @@ Tcl_UtfToUniCharDString( */ oldLength = Tcl_DStringLength(dsPtr); -/* TODO: fix overreach! */ + Tcl_DStringSetLength(dsPtr, - (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); + oldLength + (int) ((length + 1) * sizeof(Tcl_UniChar))); wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); w = wString; - end = src + length; - for (p = src; p < end; ) { + p = src; + end = src + length - TCL_UTF_MAX; + while (p < end) { p += TclUtfToUniChar(p, &ch); *w++ = ch; } + end += TCL_UTF_MAX; + while (p < end) { + if (Tcl_UtfCharComplete(p, end-p)) { + p += TclUtfToUniChar(p, &ch); + } else { + ch = UCHAR(*p++); + } + *w++ = ch; + } *w = '\0'; Tcl_DStringSetLength(dsPtr, (oldLength + ((char *) w - (char *) wString))); @@ -726,8 +750,7 @@ Tcl_UniCharAtIndex( { Tcl_UniChar ch = 0; - while (index >= 0) { - index--; + while (index-- >= 0) { src += TclUtfToUniChar(src, &ch); } return ch; @@ -756,11 +779,18 @@ Tcl_UtfAtIndex( register int index) /* The position of the desired character. */ { Tcl_UniChar ch = 0; + int len = 1; - while (index > 0) { - index--; + while (index-- > 0) { + len = TclUtfToUniChar(src, &ch); + src += len; + } +#if TCL_UTF_MAX == 4 + if (!len) { + /* Index points at character following High Surrogate */ src += TclUtfToUniChar(src, &ch); } +#endif return src; } @@ -971,7 +1001,11 @@ Tcl_UtfToTitle( } while (*src) { bytes = TclUtfToUniChar(src, &ch); - lowChar = Tcl_UniCharToLower(ch); + lowChar = ch; + /* Special exception for Georgian Asomtavruli chars, no titlecase. */ + if ((unsigned)(lowChar - 0x1C90) >= 0x30) { + lowChar = Tcl_UniCharToLower(lowChar); + } if (bytes < UtfCount(lowChar)) { memcpy(dst, src, (size_t) bytes); @@ -1072,16 +1106,17 @@ Tcl_UtfNcmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { #if TCL_UTF_MAX == 4 - /* map high surrogate characters to values > 0xffff */ - if ((ch1 & 0xFC00) == 0xD800) { - ch1 += 0x4000; - } - if ((ch2 & 0xFC00) == 0xD800) { - ch2 += 0x4000; - } + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } #endif - if (ch1 != ch2) { return (ch1 - ch2); } } @@ -1122,16 +1157,17 @@ Tcl_UtfNcasecmp( */ cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { #if TCL_UTF_MAX == 4 - /* map high surrogate characters to values > 0xffff */ - if ((ch1 & 0xFC00) == 0xD800) { - ch1 += 0x4000; - } - if ((ch2 & 0xFC00) == 0xD800) { - ch2 += 0x4000; - } + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } #endif - if (ch1 != ch2) { ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { @@ -1170,16 +1206,17 @@ TclUtfCasecmp( while (*cs && *ct) { cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { #if TCL_UTF_MAX == 4 - /* map high surrogate characters to values > 0xffff */ - if ((ch1 & 0xFC00) == 0xD800) { - ch1 += 0x4000; - } - if ((ch2 & 0xFC00) == 0xD800) { - ch2 += 0x4000; - } + /* Surrogates always report higher than non-surrogates */ + if (((ch1 & 0xFC00) == 0xD800)) { + if ((ch2 & 0xFC00) != 0xD800) { + return ch1; + } + } else if ((ch2 & 0xFC00) == 0xD800) { + return -ch2; + } #endif - if (ch1 != ch2) { ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { @@ -1240,8 +1277,9 @@ Tcl_UniCharToLower( int ch) /* Unicode character to convert. */ { int info = GetUniCharInfo(ch); + int mode = GetCaseType(info); - if (GetCaseType(info) & 0x02) { + if ((mode & 0x02) && (mode != 0x7)) { ch += GetDelta(info); } return (Tcl_UniChar) ch; @@ -1275,7 +1313,9 @@ Tcl_UniCharToTitle( * Subtract or add one depending on the original case. */ - ch += ((mode & 0x4) ? -1 : 1); + if (mode != 0x7) { + ch += ((mode & 0x4) ? -1 : 1); + } } else if (mode == 0x4) { ch -= GetDelta(info); } |