From 6860390243ff910bf7175af41567d588c930a45d Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 8 Apr 2020 07:27:50 +0000 Subject: Teach "split", "string wordstart" and "string wordend" how to handle Unicode characters >\uFFFF, using the new utility function TclUtfToUCS4(). (Thanks, Don!). Add test-cases to prove it. --- generic/tclCmdMZ.c | 29 ++++++++++++----------------- tests/split.test | 7 +++++-- tests/string.test | 12 ++++++++++++ 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 3e4f1d5..1ddae0f 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -1174,7 +1174,7 @@ Tcl_SplitObjCmd( int objc, /* Number of arguments. */ Tcl_Obj *const objv[]) /* Argument objects. */ { - Tcl_UniChar ch = 0; + int ch = 0; int len; const char *splitChars; const char *stringPtr; @@ -1217,10 +1217,8 @@ Tcl_SplitObjCmd( Tcl_InitHashTable(&charReuseTable, TCL_ONE_WORD_KEYS); for ( ; stringPtr < end; stringPtr += len) { - int ucs4; - - len = TclUtfToUCS4(stringPtr, &ucs4); - hPtr = Tcl_CreateHashEntry(&charReuseTable, INT2PTR(ucs4), &isNew); + len = TclUtfToUCS4(stringPtr, &ch); + hPtr = Tcl_CreateHashEntry(&charReuseTable, INT2PTR(ch), &isNew); if (isNew) { TclNewStringObj(objPtr, stringPtr, len); @@ -1255,7 +1253,7 @@ Tcl_SplitObjCmd( } else { const char *element, *p, *splitEnd; int splitLen; - Tcl_UniChar splitChar = 0; + int splitChar; /* * Normal case: split on any of a given set of characters. Discard @@ -1265,9 +1263,9 @@ Tcl_SplitObjCmd( splitEnd = splitChars + splitCharLen; for (element = stringPtr; stringPtr < end; stringPtr += len) { - len = TclUtfToUniChar(stringPtr, &ch); + len = TclUtfToUCS4(stringPtr, &ch); for (p = splitChars; p < splitEnd; p += splitLen) { - splitLen = TclUtfToUniChar(p, &splitChar); + splitLen = TclUtfToUCS4(p, &splitChar); if (ch == splitChar) { TclNewStringObj(objPtr, element, stringPtr - element); Tcl_ListObjAppendElement(NULL, listPtr, objPtr); @@ -2482,9 +2480,7 @@ StringRevCmd( * StringStartCmd -- * * This procedure is invoked to process the "string wordstart" Tcl - * command. See the user documentation for details on what it does. Note - * that this command only functions correctly on properly formed Tcl UTF - * strings. + * command. See the user documentation for details on what it does. * * Results: * A standard Tcl result. @@ -2502,7 +2498,7 @@ StringStartCmd( int objc, /* Number of arguments. */ Tcl_Obj *const objv[]) /* Argument objects. */ { - Tcl_UniChar ch = 0; + int ch; const char *p, *string; int cur, index, length, numChars; @@ -2524,7 +2520,7 @@ StringStartCmd( if (index > 0) { p = Tcl_UtfAtIndex(string, index); for (cur = index; cur >= 0; cur--) { - TclUtfToUniChar(p, &ch); + TclUtfToUCS4(p, &ch); if (!Tcl_UniCharIsWordChar(ch)) { break; } @@ -2544,8 +2540,7 @@ StringStartCmd( * StringEndCmd -- * * This procedure is invoked to process the "string wordend" Tcl command. - * See the user documentation for details on what it does. Note that this - * command only functions correctly on properly formed Tcl UTF strings. + * See the user documentation for details on what it does. * * Results: * A standard Tcl result. @@ -2563,7 +2558,7 @@ StringEndCmd( int objc, /* Number of arguments. */ Tcl_Obj *const objv[]) /* Argument objects. */ { - Tcl_UniChar ch = 0; + int ch; const char *p, *end, *string; int cur, index, length, numChars; @@ -2585,7 +2580,7 @@ StringEndCmd( p = Tcl_UtfAtIndex(string, index); end = string+length; for (cur = index; p < end; cur++) { - p += TclUtfToUniChar(p, &ch); + p += TclUtfToUCS4(p, &ch); if (!Tcl_UniCharIsWordChar(ch)) { break; } diff --git a/tests/split.test b/tests/split.test index 2d180e0..d00c452 100644 --- a/tests/split.test +++ b/tests/split.test @@ -71,8 +71,11 @@ test split-1.14 {basic split commands} { split ",12,,,34,56," {,} } {{} 12 {} {} 34 56 {}} test split-1.15 {basic split commands} -body { - split "a\U01f4a9b" {} -} -result "a \U01f4a9 b" + split "a\U1F4A9b" {} +} -result "a \U1F4A9 b" +test split-1.16 {basic split commands} -body { + split "a\U1F4A9b" \U1F4A9 +} -result "a b" test split-2.1 {split errors} { list [catch split msg] $msg $errorCode diff --git a/tests/string.test b/tests/string.test index 7117d4d..d358fce 100644 --- a/tests/string.test +++ b/tests/string.test @@ -1856,6 +1856,12 @@ test string-21.13.$noComp {string wordend, unicode} -body { test string-21.14.$noComp {string wordend, unicode} -body { run {string wordend "\uC700\uC700 abc" 8} } -result 6 +test string-21.15.$noComp {string wordend, unicode} -body { + run {string wordend "\U1D7CA\U1D7CA abc" 0} +} -result 2 +test string-21.16.$noComp {string wordend, unicode} -body { + run {string wordend "\U1D7CA\U1D7CA abc" 10} +} -result 8 test string-22.1.$noComp {string wordstart} -body { list [catch {run {string word a}} msg] $msg @@ -1896,6 +1902,12 @@ test string-22.12.$noComp {string wordstart, unicode} -body { test string-22.13.$noComp {string wordstart, unicode} -body { run {string wordstart "\uC700\uC700 abc" 8} } -result 3 +test string-22.14.$noComp {string wordstart, unicode} -body { + run {string wordstart "\U1D7CA\U1D7CA abc" 0} +} -result 0 +test string-22.15.$noComp {string wordstart, unicode} -body { + run {string wordstart "\U1D7CA\U1D7CA abc" 10} +} -result 5 test string-23.0.$noComp {string is boolean, Bug 1187123} testindexobj { set x 5 -- cgit v0.12 From c7de8a1d22c60626b19b0bb57ac58fa7d9d5cc16 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Wed, 8 Apr 2020 07:57:03 +0000 Subject: In TCL_UTF_MAX=4 mode, string lengths are quitely different. --- tests/string.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/string.test b/tests/string.test index d358fce..aca3570 100644 --- a/tests/string.test +++ b/tests/string.test @@ -1859,7 +1859,7 @@ test string-21.14.$noComp {string wordend, unicode} -body { test string-21.15.$noComp {string wordend, unicode} -body { run {string wordend "\U1D7CA\U1D7CA abc" 0} } -result 2 -test string-21.16.$noComp {string wordend, unicode} -body { +test string-21.16.$noComp {string wordend, unicode} -constraints tip389 -body { run {string wordend "\U1D7CA\U1D7CA abc" 10} } -result 8 @@ -1905,7 +1905,7 @@ test string-22.13.$noComp {string wordstart, unicode} -body { test string-22.14.$noComp {string wordstart, unicode} -body { run {string wordstart "\U1D7CA\U1D7CA abc" 0} } -result 0 -test string-22.15.$noComp {string wordstart, unicode} -body { +test string-22.15.$noComp {string wordstart, unicode} -constraints tip389 -body { run {string wordstart "\U1D7CA\U1D7CA abc" 10} } -result 5 -- cgit v0.12