From 1ffeaf24efc557cfbfd2120f1871cf015e466cf2 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 10 Nov 2017 08:40:45 +0000 Subject: Make "split" and "string is (alpha|graph|...)" work as expected with Unicode chars > U+ffff, when Tcl is compiled with TCL_UTF_MAX == 4. No effect when TCL_UTF_MAX == 3 or TCL_UTF_MAX == 6. Test-case added for "split". --- generic/tclCmdMZ.c | 23 ++++++++++++++++++++--- tests/split.test | 3 +++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/generic/tclCmdMZ.c b/generic/tclCmdMZ.c index 7010495..d63a985 100644 --- a/generic/tclCmdMZ.c +++ b/generic/tclCmdMZ.c @@ -309,7 +309,7 @@ Tcl_RegexpObjCmd( eflags = 0; } else if (offset > stringLength) { eflags = TCL_REG_NOTBOL; - } else if (Tcl_GetUniChar(objPtr, offset-1) == (Tcl_UniChar)'\n') { + } else if (Tcl_GetUniChar(objPtr, offset-1) == '\n') { eflags = 0; } else { eflags = TCL_REG_NOTBOL; @@ -1080,13 +1080,22 @@ Tcl_SplitObjCmd( Tcl_InitHashTable(&charReuseTable, TCL_ONE_WORD_KEYS); for ( ; stringPtr < end; stringPtr += len) { + int fullchar; len = TclUtfToUniChar(stringPtr, &ch); + fullchar = ch; + +#if TCL_UTF_MAX == 4 + if (!len) { + len += TclUtfToUniChar(stringPtr, &ch); + fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } +#endif /* * Assume Tcl_UniChar is an integral type... 
*/ - hPtr = Tcl_CreateHashEntry(&charReuseTable, INT2PTR((int) ch), + hPtr = Tcl_CreateHashEntry(&charReuseTable, INT2PTR(fullchar), &isNew); if (isNew) { TclNewStringObj(objPtr, stringPtr, len); @@ -1783,8 +1792,16 @@ StringIsCmd( } end = string1 + length1; for (; string1 < end; string1 += length2, failat++) { + int fullchar; length2 = TclUtfToUniChar(string1, &ch); - if (!chcomp(ch)) { + fullchar = ch; +#if TCL_UTF_MAX == 4 + if (!length2) { + length2 = TclUtfToUniChar(string1, &ch); + fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } +#endif + if (!chcomp(fullchar)) { result = 0; break; } diff --git a/tests/split.test b/tests/split.test index 778131f..18055b3 100644 --- a/tests/split.test +++ b/tests/split.test @@ -70,6 +70,9 @@ test split-1.13 {basic split commands} { test split-1.14 {basic split commands} { split ",12,,,34,56," {,} } {{} 12 {} {} 34 56 {}} +test split-1.15 {basic split commands} -body { + split "a\U01f4a9b" {} +} -result "a \U01f4a9 b" test split-2.1 {split errors} { list [catch split msg] $msg $errorCode -- cgit v0.12