summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: dgp <dgp@users.sourceforge.net> 2020-04-27 12:29:47 (GMT)
committer: dgp <dgp@users.sourceforge.net> 2020-04-27 12:29:47 (GMT)
commit: 0650564b5d84ab359c0aa60685e55ac76e57cfac (patch)
tree: 664188f75274ddd90dbf326d70570c80ea86670c
parent: 7a2e5e227c82ec66f8e53328ed4fd4e1e5b923e8 (diff)
parent: 60d1d8c7eb1ac57639a5666836625c845fe38f2d (diff)
download: tcl-0650564b5d84ab359c0aa60685e55ac76e57cfac.zip
tcl-0650564b5d84ab359c0aa60685e55ac76e57cfac.tar.gz
tcl-0650564b5d84ab359c0aa60685e55ac76e57cfac.tar.bz2
[45ca2338cd] Revise the [string to*] machinery for custom builds.
-rw-r--r-- generic/tclUtf.c 72
-rw-r--r-- tests/utf.test 22
2 files changed, 68 insertions, 26 deletions
diff --git a/generic/tclUtf.c b/generic/tclUtf.c
index 665607f..0e9561d 100644
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -87,6 +87,9 @@ static const unsigned char totalBytes[256] = {
static int UtfCount(int ch);
static int Invalid(unsigned char *src);
+static int UCS4ToUpper(int ch);
+static int UCS4ToLower(int ch);
+static int UCS4ToTitle(int ch);
/*
*---------------------------------------------------------------------------
@@ -1007,7 +1010,7 @@ int
Tcl_UtfToUpper(
char *str) /* String to convert in place. */
{
- Tcl_UniChar ch = 0, upChar;
+ int ch, upChar;
char *src, *dst;
int len;
@@ -1017,8 +1020,8 @@ Tcl_UtfToUpper(
src = dst = str;
while (*src) {
- len = TclUtfToUniChar(src, &ch);
- upChar = Tcl_UniCharToUpper(ch);
+ len = TclUtfToUCS4(src, &ch);
+ upChar = UCS4ToUpper(ch);
/*
* To keep badly formed Utf strings from getting inflated by the
@@ -1026,7 +1029,7 @@ Tcl_UtfToUpper(
* char to dst if its size is <= the original char.
*/
- if (len < UtfCount(upChar)) {
+ if (len < UtfCount(upChar) || ((upChar & 0xF800) == 0xD800)) {
memmove(dst, src, len);
dst += len;
} else {
@@ -1060,7 +1063,7 @@ int
Tcl_UtfToLower(
char *str) /* String to convert in place. */
{
- Tcl_UniChar ch = 0, lowChar;
+ int ch, lowChar;
char *src, *dst;
int len;
@@ -1070,8 +1073,8 @@ Tcl_UtfToLower(
src = dst = str;
while (*src) {
- len = TclUtfToUniChar(src, &ch);
- lowChar = Tcl_UniCharToLower(ch);
+ len = TclUtfToUCS4(src, &ch);
+ lowChar = UCS4ToLower(ch);
/*
* To keep badly formed Utf strings from getting inflated by the
@@ -1079,7 +1082,7 @@ Tcl_UtfToLower(
* char to dst if its size is <= the original char.
*/
- if (len < UtfCount(lowChar)) {
+ if (len < UtfCount(lowChar) || ((lowChar & 0xF800) == 0xD800)) {
memmove(dst, src, len);
dst += len;
} else {
@@ -1114,7 +1117,7 @@ int
Tcl_UtfToTitle(
char *str) /* String to convert in place. */
{
- Tcl_UniChar ch = 0, titleChar, lowChar;
+ int ch, titleChar, lowChar;
char *src, *dst;
int len;
@@ -1126,10 +1129,10 @@ Tcl_UtfToTitle(
src = dst = str;
if (*src) {
- len = TclUtfToUniChar(src, &ch);
- titleChar = Tcl_UniCharToTitle(ch);
+ len = TclUtfToUCS4(src, &ch);
+ titleChar = UCS4ToTitle(ch);
- if (len < UtfCount(titleChar)) {
+ if (len < UtfCount(titleChar) || ((titleChar & 0xF800) == 0xD800)) {
memmove(dst, src, len);
dst += len;
} else {
@@ -1138,14 +1141,14 @@ Tcl_UtfToTitle(
src += len;
}
while (*src) {
- len = TclUtfToUniChar(src, &ch);
+ len = TclUtfToUCS4(src, &ch);
lowChar = ch;
/* Special exception for Georgian Asomtavruli chars, no titlecase. */
if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
- lowChar = Tcl_UniCharToLower(lowChar);
+ lowChar = UCS4ToLower(lowChar);
}
- if (len < UtfCount(lowChar)) {
+ if (len < UtfCount(lowChar) || ((lowChar & 0xF800) == 0xD800)) {
memmove(dst, src, len);
dst += len;
} else {
@@ -1382,8 +1385,8 @@ TclUtfCasecmp(
*----------------------------------------------------------------------
*/
-Tcl_UniChar
-Tcl_UniCharToUpper(
+static int
+UCS4ToUpper(
int ch) /* Unicode character to convert. */
{
int info = GetUniCharInfo(ch);
@@ -1391,7 +1394,14 @@ Tcl_UniCharToUpper(
if (GetCaseType(info) & 0x04) {
ch -= GetDelta(info);
}
- return (Tcl_UniChar) ch;
+ return ch;
+}
+
+Tcl_UniChar
+Tcl_UniCharToUpper(
+ int ch) /* Unicode character to convert. */
+{
+ return (Tcl_UniChar) UCS4ToUpper(ch);
}
/*
@@ -1410,8 +1420,8 @@ Tcl_UniCharToUpper(
*----------------------------------------------------------------------
*/
-Tcl_UniChar
-Tcl_UniCharToLower(
+static int
+UCS4ToLower(
int ch) /* Unicode character to convert. */
{
int info = GetUniCharInfo(ch);
@@ -1420,7 +1430,14 @@ Tcl_UniCharToLower(
if ((mode & 0x02) && (mode != 0x7)) {
ch += GetDelta(info);
}
- return (Tcl_UniChar) ch;
+ return ch;
+}
+
+Tcl_UniChar
+Tcl_UniCharToLower(
+ int ch) /* Unicode character to convert. */
+{
+ return (Tcl_UniChar) UCS4ToLower(ch);
}
/*
@@ -1439,8 +1456,8 @@ Tcl_UniCharToLower(
*----------------------------------------------------------------------
*/
-Tcl_UniChar
-Tcl_UniCharToTitle(
+static int
+UCS4ToTitle(
int ch) /* Unicode character to convert. */
{
int info = GetUniCharInfo(ch);
@@ -1457,7 +1474,14 @@ Tcl_UniCharToTitle(
} else if (mode == 0x4) {
ch -= GetDelta(info);
}
- return (Tcl_UniChar) ch;
+ return ch;
+}
+
+Tcl_UniChar
+Tcl_UniCharToTitle(
+ int ch) /* Unicode character to convert. */
+{
+ return (Tcl_UniChar) UCS4ToTitle(ch);
}
/*
diff --git a/tests/utf.test b/tests/utf.test
index cf0d1bf..6fed971 100644
--- a/tests/utf.test
+++ b/tests/utf.test
@@ -1111,6 +1111,12 @@ test utf-11.4 {Tcl_UtfToUpper} {
test utf-11.5 {Tcl_UtfToUpper Georgian (new in Unicode 11)} {
string toupper \u10D0\u1C90
} \u1C90\u1C90
+test utf-11.6 {Tcl_UtfToUpper beyond U+FFFF} {Uesc fullutf} {
+ string toupper \U10428
+} \U10400
+test utf-11.7 {Tcl_UtfToUpper beyond U+FFFF} {pairsTo4bytes} {
+ string toupper \uD801\uDC28
+} \uD801\uDC00
test utf-12.1 {Tcl_UtfToLower} {
string tolower {}
@@ -1127,9 +1133,15 @@ test utf-12.4 {Tcl_UtfToLower} {
test utf-12.5 {Tcl_UtfToLower Georgian (new in Unicode 11)} {
string tolower \u10D0\u1C90
} \u10D0\u10D0
-test utf-12.6 {Tcl_UtfToUpper low/high surrogate)} ucs2 {
+test utf-12.6 {Tcl_UtfToLower low/high surrogate)} {
string tolower \uDC24\uD824
} \uDC24\uD824
+test utf-12.7 {Tcl_UtfToLower beyond U+FFFF} {Uesc fullutf} {
+ string tolower \U10400
+} \U10428
+test utf-12.8 {Tcl_UtfToLower beyond U+FFFF} {pairsTo4bytes} {
+ string tolower \uD801\uDC00
+} \uD801\uDC28
test utf-13.1 {Tcl_UtfToTitle} {
string totitle {}
@@ -1149,9 +1161,15 @@ test utf-13.5 {Tcl_UtfToTitle Georgian (new in Unicode 11)} {
test utf-13.6 {Tcl_UtfToTitle Georgian (new in Unicode 11)} {
string totitle \u1C90\u10D0
} \u1C90\u10D0
-test utf-13.7 {Tcl_UtfToTitle low/high surrogate)} ucs2 {
+test utf-13.7 {Tcl_UtfToTitle low/high surrogate)} {
string totitle \uDC24\uD824
} \uDC24\uD824
+test utf-13.8 {Tcl_UtfToTitle beyond U+FFFF} {Uesc fullutf} {
+ string totitle \U10428
+} \U10400
+test utf-13.9 {Tcl_UtfToTitle beyond U+FFFF} {pairsTo4bytes} {
+ string totitle \uD801\uDC28
+} \uD801\uDC00
test utf-14.1 {Tcl_UtfNcasecmp} {
string compare -nocase a b