From 4b0626ddd7e1f7450781deb2508d94a98c8db93d Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 17 Apr 2020 16:56:51 +0000 Subject: Bring back the test utf-2.11; it fails in a TCL_UTF_MAX=4 build. --- tests/utf.test | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/utf.test b/tests/utf.test index 7b7b5c2..a22dafe 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -13,6 +13,8 @@ if {[lsearch [namespace children] ::tcltest] == -1} { namespace import -force ::tcltest::* } +testConstraint testbytestring [llength [info commands testbytestring]] + catch {unset x} test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} { @@ -59,6 +61,12 @@ test utf-2.8 {Tcl_UtfToUniChar: longer UTF sequences not supported} { string length [bytestring "\xF4\xA2\xA2\xA2"] } {4} +test utf-2.11 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, invalid} testbytestring { + # Would decode to U+110000 but that is outside the Unicode range. + string length [testbytestring "\xF4\x90\x80\x80"] +} {4} + + test utf-3.1 {Tcl_UtfCharComplete} { } {} -- cgit v0.12 From 1e5043ff451573bf735f6aec84208af7f0c24cc2 Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 17 Apr 2020 20:15:07 +0000 Subject: Backport a collection of tests for consistency between branches. 
--- tests/utf.test | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index a22dafe..ff4f4a9 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -57,15 +57,22 @@ test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} { test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} { string length [bytestring "\xE4\xb9\x8e"] } {1} -test utf-2.8 {Tcl_UtfToUniChar: longer UTF sequences not supported} { - string length [bytestring "\xF4\xA2\xA2\xA2"] +test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring} -body { + string length [testbytestring "\xF0\x90\x80\x80"] +} -result {4} +test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring} -body { + string length [testbytestring "\xF4\x8F\xBF\xBF"] +} -result {4} +test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} testbytestring { + string length [testbytestring "\xF0\x8F\xBF\xBF"] } {4} - -test utf-2.11 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, invalid} testbytestring { +test utf-2.11 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, overflow} testbytestring { # Would decode to U+110000 but that is outside the Unicode range. string length [testbytestring "\xF4\x90\x80\x80"] } {4} - +test utf-2.12 {Tcl_UtfToUniChar: longer UTF sequences not supported} testbytestring { + string length [testbytestring "\xF8\xA2\xA2\xA2\xA2"] +} {5} test utf-3.1 {Tcl_UtfCharComplete} { } {} -- cgit v0.12 From 7067acad796b0536c589101c3f61fbae9fd268aa Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 17 Apr 2020 20:23:49 +0000 Subject: Corrections for many tests, changing lead byte \xF4 to \xF2. The tested sequences were always intended to be valid 4-byte sequences. Also a few errors with greedy \xHHHH . 
--- tests/utf.test | 92 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index ff4f4a9..7953a68 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -149,7 +149,7 @@ test utf-6.6 {Tcl_UtfNext} testutfnext { testutfnext A\xE8 } 1 test utf-6.7 {Tcl_UtfNext} testutfnext { - testutfnext A\xF4 + testutfnext A\xF2 } 1 test utf-6.8 {Tcl_UtfNext} testutfnext { testutfnext A\xF8 @@ -170,7 +170,7 @@ test utf-6.13 {Tcl_UtfNext} testutfnext { testutfnext \xA0\xE8 } 1 test utf-6.14 {Tcl_UtfNext} testutfnext { - testutfnext \xA0\xF4 + testutfnext \xA0\xF2 } 1 test utf-6.15 {Tcl_UtfNext} testutfnext { testutfnext \xA0\xF8 @@ -179,7 +179,7 @@ test utf-6.16 {Tcl_UtfNext} testutfnext { testutfnext \xD0 } 1 test utf-6.17 {Tcl_UtfNext} testutfnext { - testutfnext \xD0A + testutfnext \xD0G } 1 test utf-6.18 {Tcl_UtfNext} testutfnext { testutfnext \xD0\xA0 @@ -191,7 +191,7 @@ test utf-6.20 {Tcl_UtfNext} testutfnext { testutfnext \xD0\xE8 } 1 test utf-6.21 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xF4 + testutfnext \xD0\xF2 } 1 test utf-6.22 {Tcl_UtfNext} testutfnext { testutfnext \xD0\xF8 @@ -200,7 +200,7 @@ test utf-6.23 {Tcl_UtfNext} testutfnext { testutfnext \xE8 } 1 test utf-6.24 {Tcl_UtfNext} testutfnext { - testutfnext \xE8A + testutfnext \xE8G } 1 test utf-6.25 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xA0 @@ -212,37 +212,37 @@ test utf-6.27 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xE8 } 1 test utf-6.28 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xF4 + testutfnext \xE8\xF2 } 1 test utf-6.29 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xF8 } 1 test utf-6.30 {Tcl_UtfNext} testutfnext { - testutfnext \xF4 + testutfnext \xF2 } 1 test utf-6.31 {Tcl_UtfNext} testutfnext { - testutfnext \xF4A + testutfnext \xF2G } 1 test utf-6.32 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0 + testutfnext \xF2\xA0 } 1 test utf-6.33 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xD0 
+ testutfnext \xF2\xD0 } 1 test utf-6.34 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xE8 + testutfnext \xF2\xE8 } 1 test utf-6.35 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xF4 + testutfnext \xF2\xF2 } 1 test utf-6.36 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xF8 + testutfnext \xF2\xF8 } 1 test utf-6.37 {Tcl_UtfNext} testutfnext { testutfnext \xF8 } 1 test utf-6.38 {Tcl_UtfNext} testutfnext { - testutfnext \xF8A + testutfnext \xF8G } 1 test utf-6.39 {Tcl_UtfNext} testutfnext { testutfnext \xF8\xA0 @@ -254,7 +254,7 @@ test utf-6.41 {Tcl_UtfNext} testutfnext { testutfnext \xF8\xE8 } 1 test utf-6.42 {Tcl_UtfNext} testutfnext { - testutfnext \xF8\xF4 + testutfnext \xF8\xF2 } 1 test utf-6.43 {Tcl_UtfNext} testutfnext { testutfnext \xF8\xF8 @@ -272,7 +272,7 @@ test utf-6.47 {Tcl_UtfNext} testutfnext { testutfnext \xD0\xA0\xE8 } 2 test utf-6.48 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xA0\xF4 + testutfnext \xD0\xA0\xF2 } 2 test utf-6.49 {Tcl_UtfNext} testutfnext { testutfnext \xD0\xA0\xF8 @@ -290,28 +290,28 @@ test utf-6.53 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xA0\xE8 } 1 test utf-6.54 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xF4 + testutfnext \xE8\xA0\xF2 } 1 test utf-6.55 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xA0\xF8 } 1 test utf-6.56 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0G + testutfnext \xF2\xA0G } 1 test utf-6.57 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0 + testutfnext \xF2\xA0\xA0 } 1 test utf-6.58 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xD0 + testutfnext \xF2\xA0\xD0 } 1 test utf-6.59 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xE8 + testutfnext \xF2\xA0\xE8 } 1 test utf-6.60 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xF4 + testutfnext \xF2\xA0\xF2 } 1 test utf-6.61 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xF8 + testutfnext \xF2\xA0\xF8 } 1 test utf-6.62 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xA0\xA0G @@ -326,46 +326,46 @@ test utf-6.65 {Tcl_UtfNext} 
testutfnext { testutfnext \xE8\xA0\xA0\xE8 } 3 test utf-6.66 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xA0\xF4 + testutfnext \xE8\xA0\xA0\xF2 } 3 test utf-6.67 {Tcl_UtfNext} testutfnext { testutfnext \xE8\xA0\xA0\xF8 } 3 test utf-6.68 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0G + testutfnext \xF2\xA0\xA0G } 1 test utf-6.69 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xA0 + testutfnext \xF2\xA0\xA0\xA0 } 1 test utf-6.70 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xD0 + testutfnext \xF2\xA0\xA0\xD0 } 1 test utf-6.71 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xE8 + testutfnext \xF2\xA0\xA0\xE8 } 1 test utf-6.71 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xF4 + testutfnext \xF2\xA0\xA0\xF2 } 1 test utf-6.73 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xF8 + testutfnext \xF2\xA0\xA0\xF8 } 1 test utf-6.74 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xA0G + testutfnext \xF2\xA0\xA0\xA0G } 1 test utf-6.75 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xA0\xA0 + testutfnext \xF2\xA0\xA0\xA0\xA0 } 1 test utf-6.76 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xA0\xD0 + testutfnext \xF2\xA0\xA0\xA0\xD0 } 1 test utf-6.77 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xA0\xE8 + testutfnext \xF2\xA0\xA0\xA0\xE8 } 1 test utf-6.78 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xA0\xF4 + testutfnext \xF2\xA0\xA0\xA0\xF2 } 1 test utf-6.79 {Tcl_UtfNext} testutfnext { - testutfnext \xF4\xA0\xA0\xA0G\xF8 + testutfnext \xF2\xA0\xA0\xA0G\xF8 } 1 test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xC0\x80 @@ -425,13 +425,13 @@ test utf-7.4.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xF8\xA0\xA0 2 } 1 test utf-7.5 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4 + testutfprev A\xF2 } 1 test utf-7.5.1 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4\xA0\xA0\xA0 2 + testutfprev A\xF2\xA0\xA0\xA0 2 } 1 test utf-7.5.2 {Tcl_UtfPrev} testutfprev { - 
testutfprev A\xF4\xF8\xA0\xA0 2 + testutfprev A\xF2\xF8\xA0\xA0 2 } 1 test utf-7.6 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8 @@ -470,13 +470,13 @@ test utf-7.9.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xF8\xA0 3 } 2 test utf-7.10 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4\xA0 + testutfprev A\xF2\xA0 } 2 test utf-7.10.1 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4\xA0\xA0\xA0 3 + testutfprev A\xF2\xA0\xA0\xA0 3 } 2 test utf-7.10.2 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4\xA0\xF8\xA0 3 + testutfprev A\xF2\xA0\xF8\xA0 3 } 2 test utf-7.11 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0 @@ -518,13 +518,13 @@ test utf-7.14.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xF8 4 } 3 test utf-7.15 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4\xA0\xA0 + testutfprev A\xF2\xA0\xA0 } 3 test utf-7.15.1 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4\xA0\xA0\xA0 4 + testutfprev A\xF2\xA0\xA0\xA0 4 } 3 test utf-7.15.2 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4\xA0\xA0\xF8 4 + testutfprev A\xF2\xA0\xA0\xF8 4 } 3 test utf-7.16 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xA0 @@ -557,7 +557,7 @@ test utf-7.19 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xA0 } 4 test utf-7.20 {Tcl_UtfPrev} testutfprev { - testutfprev A\xF4\xA0\xA0\xA0 + testutfprev A\xF2\xA0\xA0\xA0 } 4 test utf-7.21 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xA0\xA0 -- cgit v0.12 From e19b1f1306ccd36f01270f8594f9315dbdf39846 Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 17 Apr 2020 20:38:38 +0000 Subject: [493dccc2de] Coverage that Tcl_UtfPrev also checks the upper range validity. 
--- tests/utf.test | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/utf.test b/tests/utf.test index 7953a68..6d87928 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -652,6 +652,30 @@ test utf-7.47.1 {Tcl_UtfPrev, pointing to 3th byte of 3-byte valid sequence} {te test utf-7.47.2 {Tcl_UtfPrev, pointing to 3th byte of 3-byte invalid sequence} {testutfprev} { testutfprev \xE8\xA0\x00 2 } 0 +test utf-7.48 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { + testutfprev A\xF4\x8F\xBF\xBF +} 4 +test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { + testutfprev A\xF4\x8F\xBF\xBF 4 +} 3 +test utf-7.48.2 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { + testutfprev A\xF4\x8F\xBF\xBF 3 +} 2 +test utf-7.48.3 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { + testutfprev A\xF4\x8F\xBF\xBF 2 +} 1 +test utf-7.49 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { + testutfprev A\xF4\x90\x80\x80 +} 4 +test utf-7.49.1 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { + testutfprev A\xF4\x90\x80\x80 4 +} 3 +test utf-7.49.2 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { + testutfprev A\xF4\x90\x80\x80 3 +} 2 +test utf-7.49.3 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { + testutfprev A\xF4\x90\x80\x80 2 +} 1 test utf-8.1 {Tcl_UniCharAtIndex: index = 0} { string index abcd 0 -- cgit v0.12 From 4520aa1ca30a7b09dc9cfc4bc9007aa262793711 Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 17 Apr 2020 21:03:30 +0000 Subject: More tests explicitly for Tcl_UtfNext near validity boundary U+110000 --- tests/utf.test | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/utf.test b/tests/utf.test index 6d87928..01e0bb2 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -403,6 +403,12 @@ test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {te test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { testutfnext 
\xF0\x80\x80 1 } 2 +test utf-6.90 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext { + testutfnext \xF4\x8F\xBF\xBF +} 1 +test utf-6.91 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext { + testutfnext \xF4\x90\x80\x80 +} 1 testConstraint testutfprev [llength [info commands testutfprev]] -- cgit v0.12 From aa9bb7f9e401573bc8c79e8336fdb74636b2702f Mon Sep 17 00:00:00 2001 From: dgp Date: Fri, 17 Apr 2020 21:07:09 +0000 Subject: [493dccc2de] Revise sequence validity check to reject out of range decodes too. --- generic/tclUtf.c | 53 +++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/generic/tclUtf.c b/generic/tclUtf.c index b5c430b..1883804 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -81,7 +81,7 @@ static CONST unsigned char totalBytes[256] = { */ static int UtfCount(int ch); -static int Overlong(unsigned char *src); +static int Invalid(unsigned char *src); /* *--------------------------------------------------------------------------- @@ -120,51 +120,52 @@ UtfCount( /* *--------------------------------------------------------------------------- * - * Overlong -- + * Invalid -- * * Utility routine to report whether /src/ points to the start of an - * overlong byte sequence that should be rejected. Caller guarantees - * that src[0] and src[1] are readable, and + * invald byte sequence that should be rejected. This might be because + * it is an overlong encoding, or because it encodes something out of + * the proper range. Caller guarantees that src[0] and src[1] are + * readable, and * * (src[0] >= 0xC0) && (src[0] != 0xC1) * (src[1] >= 0x80) && (src[1] < 0xC0) - * (src[0] < ((TCL_UTF_MAX > 3) ? 0xF8 : 0xF0)) + * (src[0] < ((TCL_UTF_MAX > 3) ? 0xF5 : 0xF0)) * * Results: * A boolean. 
*--------------------------------------------------------------------------- */ -static CONST unsigned char overlong[3] = { - 0x80, /* \xD0 -- all sequences valid */ - 0xA0, /* \xE0\x80 through \xE0\x9F are invalid prefixes */ +static CONST unsigned char bounds[28] = { + 0x80, 0x80, /* \xC0 accepts \x80 only */ + 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, + 0x80, 0xBF, /* (\xC4 - \xDC) -- all sequences valid */ + 0xA0, 0xBF, /* \xE0\x80 through \xE0\x9F are invalid prefixes */ + 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, /* (\xE4 - \xEC) -- all valid */ #if TCL_UTF_MAX > 3 - 0x90 /* \xF0\x80 through \xF0\x8F are invalid prefixes */ + 0x90, 0xBF, /* \xF0\x80 through \xF0\x8F are invalid prefixes */ + 0x80, 0x8F /* \xF4\x90 and higher are invalid prefixes */ #else - 0xC0 /* Not used, but reject all again for safety. */ + 0xC0, 0xBF, /* Not used, but reject all again for safety. */ + 0xC0, 0xBF /* Not used, but reject all again for safety. */ #endif }; INLINE static int -Overlong( +Invalid( unsigned char *src) /* Points to lead byte of a UTF-8 byte sequence */ { unsigned char byte = *src; + int index; - if (byte % 0x10) { - /* Only lead bytes 0xC0, 0xE0, 0xF0 need examination */ + if (byte % 0x04) { + /* Only lead bytes 0xC0, 0xE0, 0xF0, 0xF4 need examination */ return 0; } - if (byte == 0xC0) { - if (src[1] == 0x80) { - /* Valid sequence: \xC0\x80 for \u0000 */ - return 0; - } - /* Reject overlong: \xC0\x81 - \xC0\xBF */ - return 1; - } - if (src[1] < overlong[(byte >> 4) - 0x0D]) { - /* Reject overlong */ + index = (byte - 0xC0) >> 1; + if (src[1] < bounds[index] || src[1] > bounds[index+1]) { + /* Out of bounds - report invalid. */ return 1; } return 0; @@ -733,7 +734,7 @@ Tcl_UtfNext( } next++; } - if (Overlong((unsigned char *)src)) { + if (Invalid((unsigned char *)src)) { return src + 1; } return next; @@ -843,10 +844,10 @@ Tcl_UtfPrev( /* * trailBytesSeen > 0, so we can examine look[1] safely. 
- * Use that capability to screen out overlong sequences. + * Use that capability to screen out invalid sequences. */ - if (Overlong(look)) { + if (Invalid(look)) { /* Reject */ return fallback; } -- cgit v0.12 From ec1723eeadcf9efe52b0f81a65d683dff9b160c5 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sat, 18 Apr 2020 12:46:54 +0000 Subject: Update documentation of Tcl_UtfPrev/Tcl_UtfNext back to how it was. Will be updated later, when implementation is ready and agreed upon. --- doc/Utf.3 | 37 +++++++++++------------------ generic/tclUtf.c | 72 ++++++++++++++++++++------------------------------------ 2 files changed, 39 insertions(+), 70 deletions(-) diff --git a/doc/Utf.3 b/doc/Utf.3 index cb82699..334fa6f 100644 --- a/doc/Utf.3 +++ b/doc/Utf.3 @@ -3,7 +3,7 @@ '\" '\" See the file "license.terms" for information on usage and redistribution '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. -'\" +'\" .TH Utf 3 "8.1" Tcl "Tcl Library Procedures" .so man.macros .BS @@ -13,7 +13,7 @@ Tcl_UniChar, Tcl_UniCharToUtf, Tcl_UtfToUniChar, Tcl_UniCharToUtfDString, Tcl_Ut .nf \fB#include \fR .sp -typedef ... Tcl_UniChar; +typedef ... \fBTcl_UniChar\fR; .sp int \fBTcl_UniCharToUtf\fR(\fIch, buf\fR) @@ -48,7 +48,7 @@ int int \fBTcl_UtfCharComplete\fR(\fIsrc, length\fR) .sp -int +int \fBTcl_NumUtfChars\fR(\fIsrc, length\fR) .sp const char * @@ -109,7 +109,7 @@ Pointer to the beginning of a UTF-8 string. .AP int index in The index of a character (not byte) in the UTF-8 string. .AP int *readPtr out -If non-NULL, filled with the number of bytes in the backslash sequence, +If non-NULL, filled with the number of bytes in the backslash sequence, including the backslash character. .AP char *dst out Buffer in which the bytes represented by the backslash sequence are stored. 
@@ -141,8 +141,8 @@ source buffer is long enough such that this routine does not run off the end and dereference non-existent or random memory; if the source buffer is known to be null-terminated, this will not happen. If the input is not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first -byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and -0x00FF and return 1. +byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0080 and +0x00FF and return 1. .PP \fBTcl_UniCharToUtfDString\fR converts the given Unicode string to UTF-8, storing the result in a previously initialized \fBTcl_DString\fR. @@ -210,27 +210,18 @@ length is negative, all bytes up to the first null byte are used. \fBTcl_UtfFindFirst\fR corresponds to \fBstrchr\fR for UTF-8 strings. It returns a pointer to the first occurrence of the Tcl_UniChar \fIch\fR in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is -considered part of the UTF-8 string. +considered part of the UTF-8 string. .PP \fBTcl_UtfFindLast\fR corresponds to \fBstrrchr\fR for UTF-8 strings. It returns a pointer to the last occurrence of the Tcl_UniChar \fIch\fR in the null-terminated UTF-8 string \fIsrc\fR. The null terminator is -considered part of the UTF-8 string. +considered part of the UTF-8 string. .PP -\fBTcl_UtfNext\fR is used to step forward through a UTF-8 string. -If the UTF-8 string is made up entirely of complete, well-formed, and -valid character byte sequences, and \fIsrc\fR points to the lead byte -of one of those sequences, then repeated calls of \fBTcl_UtfNext\fR will -return pointers to the lead bytes of each character in the string, one -character at a time. In any other circumstance, \fBTcl_UtfNext\fR -returns \fIsrc\fR+1. \fBTcl_UtfNext\fR will always read \fIsrc[0]\fR -and may read as many following bytes (up to a total of \fBTCL_UTF_MAX\fR) -as needed to find the end of the byte sequence. 
If the string is -\fBNUL\fR-terminated, \fBTcl_UtfNext\fR will not read beyond the terminating -\fBNUL\fR byte. If not, the caller must use the companion routine -\fBTcl_UtfCharComplete\fR to determine whether there is any risk -\fBTcl_UtfNext\fR might read beyond the readable memory occupied -by the string. +Given \fIsrc\fR, a pointer to some location in a UTF-8 string, +\fBTcl_UtfNext\fR returns a pointer to the next UTF-8 character in the +string. The caller must not ask for the next character after the last +character in the string if the string is not terminated by a null +character. .PP \fBTcl_UtfPrev\fR is used to step backward through but not beyond the UTF-8 string that begins at \fIstart\fR. If the UTF-8 string is made @@ -262,7 +253,7 @@ characters. Behavior is undefined if a negative \fIindex\fR is given. .PP \fBTcl_UtfAtIndex\fR returns a pointer to the specified character (not byte) \fIindex\fR in the UTF-8 string \fIsrc\fR. The source string must -contain at least \fIindex\fR characters. This is equivalent to calling +contain at least \fIindex\fR characters. This is equivalent to calling \fBTcl_UtfNext\fR \fIindex\fR times. If a negative \fIindex\fR is given, the return pointer points to the first character in the source string. .PP diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 1883804..64ee0a8 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -678,35 +678,13 @@ Tcl_UtfFindLast( * * Tcl_UtfNext -- * - * The aim of this routine is to provide a way to iterate forward - * through a UTF-8 string. The caller is expected to pass a non-NULL - * pointer argument /src/ which points to a location within a string. - * (*src) will be read, so /src/ must not point to an unreadable - * location past the end of the string. If /src/ points to the - * beginning of a complete, well-formed and valid UTF_8 byte sequence - * of no more than TCL_UTF_MAX bytes, Tcl_UtfNext returns the pointer - * just past the end of that sequence. 
In any other circumstance, - * Tcl_UtfNext returns /src/+1. - * - * Because this routine always returns a value > /src/, it is useful - * as a forward iterator that will always make progress. If the string - * is NUL-terminated, Tcl_UtfNext will not read beyond the terminating - * NUL character. If it is not NUL-terminated, the caller must make - * use of the companion routine Tcl_UtfCharComplete to test whether - * there is risk that Tcl_UtfNext will read beyond the end of the string. - * Tcl_UtfNext will never read more than TCL_UTF_MAX bytes. - * - * In a string where all characters are complete and properly formed, - * and /src/ points to the first byte of a character, repeated - * Tcl_UtfNext calls will step to the starting bytes of characters, one - * character at a time. Within those limitations, Tcl_UtfPrev and - * Tcl_UtfNext are inverses. If either condition cannot be met, - * Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and the - * caller will have to take greater care. + * Given a pointer to some current location in a UTF-8 string, move + * forward one character. The caller must ensure that they are not asking + * for the next character after the last character in the string. * * Results: - * A pointer to the start of the next character in the string (or to - * the end of the string) as described above. + * The return value is the pointer to the next character in the UTF-8 + * string. * * Side effects: * None. @@ -747,37 +725,37 @@ Tcl_UtfNext( * * The aim of this routine is to provide a way to move backward * through a UTF-8 string. The caller is expected to pass non-NULL - * pointer arguments /start/ and /src/. /start/ points to the beginning - * of a string, and /src/ (>= /start/) points to a location within (or - * just past the end) of the string. This routine always returns a - * pointer within the string (>= /start/). When (/src/ == /start/), - * it returns /start/. 
When (/src/ > /start/), it returns a pointer - * (< /src/) and (>= /src/ - TCL_UTF_MAX). Subject to these constraints, - * the routine returns a pointer to the earliest byte in the string that - * starts a character when characters are read starting at /start/ and + * pointer arguments start and src. start points to the beginning + * of a string, and src >= start points to a location within (or just + * past the end) of the string. This routine always returns a + * pointer within the string (>= start). When (src == start), it + * returns start. When (src > start), it returns a pointer (< src) + * and (>= src - TCL_UTF_MAX). Subject to these constraints, the + * routine returns a pointer to the earliest byte in the string that + * starts a character when characters are read starting at start and * that character might include the byte src[-1]. The routine will * examine only those bytes in the range that might be returned. - * It will not examine the byte (*src), and because of that cannot + * It will not examine the byte *src, and because of that cannot * determine for certain in all circumstances whether the character * that begins with the returned pointer will or will not include - * the byte src[-1]. In the scenario where /src/ points to the end of - * a buffer being filled, the returned pointer points to either the + * the byte src[-1]. In the scenario, where src points to the end of + * a buffer being filled, the returned pointer point to either the * final complete character in the string or to the earliest byte * that might start an incomplete character waiting for more bytes to * complete. 
* - * Because this routine always returns a value < /src/ until the point - * it is forced to return /start/, it is useful as a backward iterator + * Because this routine always returns a value < src until the point + * it is forced to return start, it is useful as a backward iterator * through a string that will always make progress and always be * prevented from running past the beginning of the string. * * In a string where all characters are complete and properly formed, - * and /src/ points to the first byte of a character, repeated - * Tcl_UtfPrev calls will step to the starting bytes of characters, one - * character at a time. Within those limitations, Tcl_UtfPrev and - * Tcl_UtfNext are inverses. If either condition cannot be met, - * Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and the - * caller will have to take greater care. + * and the value of src points to the first byte of a character, + * repeated Tcl_UtfPrev calls will step to the starting bytes of + * characters, one character at a time. Within those limitations, + * Tcl_UtfPrev and Tcl_UtfNext are inverses. If either condition cannot + * be met, Tcl_UtfPrev and Tcl_UtfNext may not function as inverses and + * the caller will have to take greater care. * * Results: * A pointer to the start of a character in the string as described @@ -887,7 +865,7 @@ Tcl_UtfPrev( * * Tcl_UniCharAtIndex -- * - * Returns the Unicode character represented at the specified character + * Returns the Tcl_UniChar represented at the specified character * (not byte) position in the UTF-8 string. 
* * Results: -- cgit v0.12 From 6f00fef31d332688308f392fd5df4cab98d05161 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sat, 18 Apr 2020 13:47:06 +0000 Subject: Fix [c574e50a3b30e76f]: CRASH: utf-2.[89] in 8.5 built with TCL_UTF_MAX=4 --- generic/regcustom.h | 2 +- generic/tcl.h | 2 +- generic/tclUtf.c | 84 +---------------- tests/utf.test | 259 ++++++++++++++++++++++++++-------------------------- tests/util.test | 1 + 5 files changed, 132 insertions(+), 216 deletions(-) diff --git a/generic/regcustom.h b/generic/regcustom.h index 57a2d47..ac33087 100644 --- a/generic/regcustom.h +++ b/generic/regcustom.h @@ -97,7 +97,7 @@ typedef int celt; /* Type to hold chr, or NOCELT */ #define NOCELT (-1) /* Celt value which is not valid chr */ #define CHR(c) (UCHAR(c)) /* Turn char literal into chr literal */ #define DIGITVAL(c) ((c)-'0') /* Turn chr digit into its value */ -#if TCL_UTF_MAX > 4 +#if TCL_UTF_MAX > 3 #define CHRBITS 32 /* Bits in a chr; must not use sizeof */ #define CHR_MIN 0x00000000 /* Smallest and largest chr; the value */ #define CHR_MAX 0xffffffff /* CHR_MAX-CHR_MIN+1 should fit in uchr */ diff --git a/generic/tcl.h b/generic/tcl.h index 7378a8f..d7d064c 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2148,7 +2148,7 @@ typedef struct Tcl_Parse { * reflected in regcustom.h. */ -#if TCL_UTF_MAX > 4 +#if TCL_UTF_MAX > 3 /* * unsigned int isn't 100% accurate as it should be a strict 4-byte value * (perhaps wchar_t). 64-bit systems may have troubles. 
The size of this diff --git a/generic/tclUtf.c b/generic/tclUtf.c index 64ee0a8..3741d70 100644 --- a/generic/tclUtf.c +++ b/generic/tclUtf.c @@ -209,30 +209,6 @@ Tcl_UniCharToUtf( return 2; } if (ch <= 0xFFFF) { -#if TCL_UTF_MAX == 4 - if ((ch & 0xF800) == 0xD800) { - if (ch & 0x0400) { - /* Low surrogate */ - if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80) - && ((buf[2] & 0xCF) == 0)) { - /* Previous Tcl_UniChar was a High surrogate, so combine */ - buf[3] = (char) ((ch & 0x3F) | 0x80); - buf[2] |= (char) (((ch >> 6) & 0x0F) | 0x80); - return 4; - } - /* Previous Tcl_UniChar was not a High surrogate, so just output */ - } else { - /* High surrogate */ - ch += 0x40; - /* Fill buffer with specific 3-byte (invalid) byte combination, - so following Low surrogate can recognize it and combine */ - buf[2] = (char) ((ch << 4) & 0x30); - buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80); - buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0); - return 0; - } - } -#endif goto three; } @@ -321,15 +297,6 @@ Tcl_UniCharToUtfDString( * Tcl_UtfCharComplete() before calling this routine to ensure that * enough bytes remain in the string. * - * If TCL_UTF_MAX == 4, special handling of Surrogate pairs is done: - * For any UTF-8 string containing a character outside of the BMP, the - * first call to this function will fill *chPtr with the high surrogate - * and generate a return value of 0. Calling Tcl_UtfToUniChar again - * will produce the low surrogate and a return value of 4. Because *chPtr - * is used to remember whether the high surrogate is already produced, it - * is recommended to initialize the variable it points to as 0 before - * the first call to Tcl_UtfToUniChar is done. - * * Results: * *chPtr is filled with the Tcl_UniChar, and the return value is the * number of bytes from the UTF-8 string that were consumed. @@ -402,34 +369,15 @@ Tcl_UtfToUniChar( /* * Four-byte-character lead byte followed by three trail bytes. 
*/ -#if TCL_UTF_MAX == 4 - Tcl_UniChar surrogate; - - byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) - | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000; - surrogate = (Tcl_UniChar) (0xD800 + (byte >> 10)); - if (byte & 0x100000) { - /* out of range, < 0x10000 or > 0x10ffff */ - } else if (*chPtr != surrogate) { - /* produce high surrogate, but don't advance source pointer */ - *chPtr = surrogate; - return 0; - } else { - /* produce low surrogate, and advance source pointer */ - *chPtr = (Tcl_UniChar) (0xDC00 | (byte & 0x3FF)); - return 4; - } -#else *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) { return 4; } -#endif } /* - * A four-byte-character lead-byte not followed by two trail-bytes + * A four-byte-character lead-byte not followed by three trail-bytes * represents itself. */ } @@ -1230,16 +1178,6 @@ Tcl_UtfNcmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX == 4 - /* Surrogates always report higher than non-surrogates */ - if (((ch1 & 0xFC00) == 0xD800)) { - if ((ch2 & 0xFC00) != 0xD800) { - return ch1; - } - } else if ((ch2 & 0xFC00) == 0xD800) { - return -ch2; - } -#endif return (ch1 - ch2); } } @@ -1280,16 +1218,6 @@ Tcl_UtfNcasecmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX == 4 - /* Surrogates always report higher than non-surrogates */ - if (((ch1 & 0xFC00) == 0xD800)) { - if ((ch2 & 0xFC00) != 0xD800) { - return ch1; - } - } else if ((ch2 & 0xFC00) == 0xD800) { - return -ch2; - } -#endif ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { @@ -1329,16 +1257,6 @@ TclUtfCasecmp( cs += TclUtfToUniChar(cs, &ch1); ct += TclUtfToUniChar(ct, &ch2); if (ch1 != ch2) { -#if TCL_UTF_MAX == 4 - /* Surrogates always report higher than non-surrogates */ - if (((ch1 & 0xFC00) == 0xD800)) { - if ((ch2 & 
0xFC00) != 0xD800) { - return ch1; - } - } else if ((ch2 & 0xFC00) == 0xD800) { - return -ch2; - } -#endif ch1 = Tcl_UniCharToLower(ch1); ch2 = Tcl_UniCharToLower(ch2); if (ch1 != ch2) { diff --git a/tests/utf.test b/tests/utf.test index 01e0bb2..9a55729 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -13,54 +13,61 @@ if {[lsearch [namespace children] ::tcltest] == -1} { namespace import -force ::tcltest::* } +testConstraint compat85 [expr {[format %c 0x010000] == "\uFFFD"}] testConstraint testbytestring [llength [info commands testbytestring]] +testConstraint testfindfirst [llength [info commands testfindfirst]] +testConstraint testfindlast [llength [info commands testfindlast]] +testConstraint testnumutfchars [llength [info commands testnumutfchars]] +testConstraint teststringobj [llength [info commands teststringobj]] +testConstraint testutfnext [llength [info commands testutfnext]] +testConstraint testutfprev [llength [info commands testutfprev]] catch {unset x} -test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} { +test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} testbytestring { set x \x01 -} [bytestring "\x01"] -test utf-1.2 {Tcl_UniCharToUtf: 2 byte sequences} { +} [testbytestring "\x01"] +test utf-1.2 {Tcl_UniCharToUtf: 2 byte sequences} testbytestring { set x "\x00" -} [bytestring "\xc0\x80"] -test utf-1.3 {Tcl_UniCharToUtf: 2 byte sequences} { +} [testbytestring "\xC0\x80"] +test utf-1.3 {Tcl_UniCharToUtf: 2 byte sequences} testbytestring { set x "\xe0" -} [bytestring "\xc3\xa0"] -test utf-1.4 {Tcl_UniCharToUtf: 3 byte sequences} { - set x "\u4e4e" -} [bytestring "\xe4\xb9\x8e"] -test utf-1.5 {Tcl_UniCharToUtf: overflowed Tcl_UniChar} { +} [testbytestring "\xC3\xA0"] +test utf-1.4 {Tcl_UniCharToUtf: 3 byte sequences} testbytestring { + set x "\u4E4E" +} [testbytestring "\xE4\xB9\x8E"] +test utf-1.5 {Tcl_UniCharToUtf: overflowed Tcl_UniChar} testbytestring { format %c 0x110000 -} [bytestring "\xef\xbf\xbd"] -test utf-1.6 {Tcl_UniCharToUtf: negative 
Tcl_UniChar} { +} [testbytestring "\xEF\xBF\xBD"] +test utf-1.6 {Tcl_UniCharToUtf: negative Tcl_UniChar} testbytestring { format %c -1 -} [bytestring "\xef\xbf\xbd"] +} [testbytestring "\xEF\xBF\xBD"] test utf-2.1 {Tcl_UtfToUniChar: low ascii} { string length "abc" } {3} -test utf-2.2 {Tcl_UtfToUniChar: naked trail bytes} { - string length [bytestring "\x82\x83\x84"] +test utf-2.2 {Tcl_UtfToUniChar: naked trail bytes} testbytestring { + string length [testbytestring "\x82\x83\x84"] } {3} -test utf-2.3 {Tcl_UtfToUniChar: lead (2-byte) followed by non-trail} { - string length [bytestring "\xC2"] +test utf-2.3 {Tcl_UtfToUniChar: lead (2-byte) followed by non-trail} testbytestring { + string length [testbytestring "\xC2"] } {1} -test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} { - string length [bytestring "\xC2\xa2"] +test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} testbytestring { + string length [testbytestring "\xC2\xA2"] } {1} -test utf-2.5 {Tcl_UtfToUniChar: lead (3-byte) followed by non-trail} { - string length [bytestring "\xE2"] +test utf-2.5 {Tcl_UtfToUniChar: lead (3-byte) followed by non-trail} testbytestring { + string length [testbytestring "\xE2"] } {1} -test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} { - string length [bytestring "\xE2\xA2"] +test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} testbytestring { + string length [testbytestring "\xE2\xA2"] } {2} -test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} { - string length [bytestring "\xE4\xb9\x8e"] +test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} testbytestring { + string length [testbytestring "\xE4\xb9\x8E"] } {1} -test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring} -body { +test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring compat85} -body { string length [testbytestring "\xF0\x90\x80\x80"] } -result {4} 
-test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring} -body { +test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring compat85} -body { string length [testbytestring "\xF4\x8F\xBF\xBF"] } -result {4} test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} testbytestring { @@ -77,57 +84,51 @@ test utf-2.12 {Tcl_UtfToUniChar: longer UTF sequences not supported} testbytestr test utf-3.1 {Tcl_UtfCharComplete} { } {} -testConstraint testnumutfchars [llength [info commands testnumutfchars]] -testConstraint testfindfirst [llength [info commands testfindfirst]] -testConstraint testfindlast [llength [info commands testfindlast]] - test utf-4.1 {Tcl_NumUtfChars: zero length} testnumutfchars { testnumutfchars "" } {0} -test utf-4.2 {Tcl_NumUtfChars: length 1} testnumutfchars { - testnumutfchars [bytestring "\xC2\xA2"] +test utf-4.2 {Tcl_NumUtfChars: length 1} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xC2\xA2"] } {1} -test utf-4.3 {Tcl_NumUtfChars: long string} testnumutfchars { - testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"] +test utf-4.3 {Tcl_NumUtfChars: long string} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8e\uA2\u4E4E"] } {7} -test utf-4.4 {Tcl_NumUtfChars: #u0000} testnumutfchars { - testnumutfchars [bytestring "\xC0\x80"] +test utf-4.4 {Tcl_NumUtfChars: #u0000} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xC0\x80"] } {1} test utf-4.5 {Tcl_NumUtfChars: zero length, calc len} testnumutfchars { testnumutfchars "" 0 } {0} -test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} testnumutfchars { - testnumutfchars [bytestring "\xC2\xA2"] 1 +test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xC2\xA2"] 1 } {1} -test utf-4.7 {Tcl_NumUtfChars: long string, calc len} testnumutfchars { - 
testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"] 10 +test utf-4.7 {Tcl_NumUtfChars: long string, calc len} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\uA2\u4E4E"] 10 } {7} -test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} testnumutfchars { - testnumutfchars [bytestring "\xC0\x80"] 1 +test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xC0\x80"] 1 } {1} # Bug [2738427]: Tcl_NumUtfChars(...) no overflow check -test utf-4.9 {Tcl_NumUtfChars: #u20AC, calc len, incomplete} testnumutfchars { - testnumutfchars [bytestring "\xE2\x82\xAC"] 2 +test utf-4.9 {Tcl_NumUtfChars: #u20AC, calc len, incomplete} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xE2\x82\xAC"] 2 } {2} -test utf-4.10 {Tcl_NumUtfChars: #u0000, calc len, overcomplete} testnumutfchars { - testnumutfchars [bytestring "\x00"] 2 +test utf-4.10 {Tcl_NumUtfChars: #u0000, calc len, overcomplete} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\x00"] 2 } {2} -test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 characater} testnumutfchars { - testnumutfchars [bytestring \xf0\x9f\x92\xa9] 3 +test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 characater} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring \xf0\x9f\x92\xA9] 3 } {3} -test utf-4.12 {Tcl_NumUtfChars: #4-byte UTF-8 character} testnumutfchars { - testnumutfchars [bytestring \xf0\x9f\x92\xa9] 4 +test utf-4.12 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring compat85} { + testnumutfchars [testbytestring \xf0\x9f\x92\xA9] 4 } {4} -test utf-5.1 {Tcl_UtfFindFirst} testfindfirst { - testfindfirst [bytestring "abcbc"] 98 +test utf-5.1 {Tcl_UtfFindFirst} {testfindfirst testbytestring} { + testfindfirst [testbytestring "abcbc"] 98 } {bcbc} -test utf-5.2 {Tcl_UtfFindLast} testfindlast { - testfindlast [bytestring "abcbc"] 98 +test utf-5.2 
{Tcl_UtfFindLast} {testfindlast testbytestring} { + testfindlast [testbytestring "abcbc"] 98 } {bc} -testConstraint testutfnext [llength [info commands testutfnext]] - test utf-6.1 {Tcl_UtfNext} testutfnext { # This takes the pointer one past the terminating NUL. # This is really an invalid call. @@ -334,7 +335,7 @@ test utf-6.67 {Tcl_UtfNext} testutfnext { test utf-6.68 {Tcl_UtfNext} testutfnext { testutfnext \xF2\xA0\xA0G } 1 -test utf-6.69 {Tcl_UtfNext} testutfnext { +test utf-6.69 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0 } 1 test utf-6.70 {Tcl_UtfNext} testutfnext { @@ -349,22 +350,22 @@ test utf-6.71 {Tcl_UtfNext} testutfnext { test utf-6.73 {Tcl_UtfNext} testutfnext { testutfnext \xF2\xA0\xA0\xF8 } 1 -test utf-6.74 {Tcl_UtfNext} testutfnext { +test utf-6.74 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0G } 1 -test utf-6.75 {Tcl_UtfNext} testutfnext { +test utf-6.75 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xA0 } 1 -test utf-6.76 {Tcl_UtfNext} testutfnext { +test utf-6.76 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xD0 } 1 -test utf-6.77 {Tcl_UtfNext} testutfnext { +test utf-6.77 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xE8 } 1 -test utf-6.78 {Tcl_UtfNext} testutfnext { +test utf-6.78 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xF2 } 1 -test utf-6.79 {Tcl_UtfNext} testutfnext { +test utf-6.79 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0G\xF8 } 1 test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext { @@ -388,7 +389,7 @@ test utf-6.85 {Tcl_UtfNext - overlong sequences} testutfnext { test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xF0\x80\x80\x80 } 1 -test utf-6.87 {Tcl_UtfNext - overlong sequences} testutfnext { +test utf-6.87 {Tcl_UtfNext - overlong sequences} {testutfnext compat85} { testutfnext \xF0\x90\x80\x80 } 1 test utf-6.88 {Tcl_UtfNext, pointing to 
2th byte of 3-byte valid sequence} {testutfnext} { @@ -403,15 +404,13 @@ test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {te test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { testutfnext \xF0\x80\x80 1 } 2 -test utf-6.90 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext { +test utf-6.90 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext compat85} { testutfnext \xF4\x8F\xBF\xBF } 1 test utf-6.91 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext { testutfnext \xF4\x90\x80\x80 } 1 -testConstraint testutfprev [llength [info commands testutfprev]] - test utf-7.1 {Tcl_UtfPrev} testutfprev { testutfprev {} } 0 @@ -475,13 +474,13 @@ test utf-7.9.1 {Tcl_UtfPrev} testutfprev { test utf-7.9.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xF8\xA0 3 } 2 -test utf-7.10 {Tcl_UtfPrev} testutfprev { +test utf-7.10 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0 } 2 -test utf-7.10.1 {Tcl_UtfPrev} testutfprev { +test utf-7.10.1 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xA0 3 } 2 -test utf-7.10.2 {Tcl_UtfPrev} testutfprev { +test utf-7.10.2 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xF8\xA0 3 } 2 test utf-7.11 {Tcl_UtfPrev} testutfprev { @@ -523,13 +522,13 @@ test utf-7.14.1 {Tcl_UtfPrev} testutfprev { test utf-7.14.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xF8 4 } 3 -test utf-7.15 {Tcl_UtfPrev} testutfprev { +test utf-7.15 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0 } 3 -test utf-7.15.1 {Tcl_UtfPrev} testutfprev { +test utf-7.15.1 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xA0 4 } 3 -test utf-7.15.2 {Tcl_UtfPrev} testutfprev { +test utf-7.15.2 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xF8 4 } 3 test utf-7.16 {Tcl_UtfPrev} testutfprev { @@ -562,7 +561,7 @@ test utf-7.18.2 {Tcl_UtfPrev} testutfprev { test utf-7.19 {Tcl_UtfPrev} testutfprev { testutfprev 
A\xF8\xA0\xA0\xA0 } 4 -test utf-7.20 {Tcl_UtfPrev} testutfprev { +test utf-7.20 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xA0 } 4 test utf-7.21 {Tcl_UtfPrev} testutfprev { @@ -622,16 +621,16 @@ test utf-7.36 {Tcl_UtfPrev -- overlong sequence} testutfprev { test utf-7.37 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\xA0\x80 3 } 1 -test utf-7.38 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.38 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xE0\xA0\x80 2 } 1 -test utf-7.39 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.39 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xF0\x90\x80\x80 } 4 -test utf-7.40 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.40 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xF0\x90\x80\x80 4 } 3 -test utf-7.41 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.41 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xF0\x90\x80\x80 3 } 2 test utf-7.42 {Tcl_UtfPrev -- overlong sequence} testutfprev { @@ -658,13 +657,13 @@ test utf-7.47.1 {Tcl_UtfPrev, pointing to 3th byte of 3-byte valid sequence} {te test utf-7.47.2 {Tcl_UtfPrev, pointing to 3th byte of 3-byte invalid sequence} {testutfprev} { testutfprev \xE8\xA0\x00 2 } 0 -test utf-7.48 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { +test utf-7.48 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { testutfprev A\xF4\x8F\xBF\xBF } 4 -test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { +test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { testutfprev A\xF4\x8F\xBF\xBF 4 } 3 -test utf-7.48.2 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { +test utf-7.48.2 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { testutfprev A\xF4\x8F\xBF\xBF 3 } 2 test utf-7.48.3 {Tcl_UtfPrev, validity check [493dccc2de]} 
testutfprev { @@ -708,18 +707,18 @@ test utf-10.1 {Tcl_UtfBackslash: dst == NULL} { set x \n } { } -test utf-10.2 {Tcl_UtfBackslash: \u subst} { - set x \ua2 -} [bytestring "\xc2\xa2"] -test utf-10.3 {Tcl_UtfBackslash: longer \u subst} { - set x \u4e21 -} [bytestring "\xe4\xb8\xa1"] -test utf-10.4 {Tcl_UtfBackslash: stops at first non-hex} { - set x \u4e2k -} "[bytestring \xd3\xa2]k" -test utf-10.5 {Tcl_UtfBackslash: stops after 4 hex chars} { - set x \u4e216 -} "[bytestring \xe4\xb8\xa1]6" +test utf-10.2 {Tcl_UtfBackslash: \u subst} testbytestring { + set x \uA2 +} [testbytestring "\xC2\xA2"] +test utf-10.3 {Tcl_UtfBackslash: longer \u subst} testbytestring { + set x \u4E21 +} [testbytestring "\xE4\xB8\xA1"] +test utf-10.4 {Tcl_UtfBackslash: stops at first non-hex} testbytestring { + set x \u4E2k +} "[testbytestring \xD3\xA2]k" +test utf-10.5 {Tcl_UtfBackslash: stops after 4 hex chars} testbytestring { + set x \u4E216 +} "[testbytestring \xE4\xB8\xA1]6" proc bsCheck {char num} { global errNum test utf-10.$errNum {backslash substitution} { @@ -774,11 +773,11 @@ test utf-11.2 {Tcl_UtfToUpper} { string toupper abc } ABC test utf-11.3 {Tcl_UtfToUpper} { - string toupper \u00e3ab -} \u00c3AB + string toupper \u00E3ab +} \u00C3AB test utf-11.4 {Tcl_UtfToUpper} { - string toupper \u01e3ab -} \u01e2AB + string toupper \u01E3ab +} \u01E2AB test utf-12.1 {Tcl_UtfToLower} { string tolower {} @@ -787,11 +786,11 @@ test utf-12.2 {Tcl_UtfToLower} { string tolower ABC } abc test utf-12.3 {Tcl_UtfToLower} { - string tolower \u00c3AB -} \u00e3ab + string tolower \u00C3AB +} \u00E3ab test utf-12.4 {Tcl_UtfToLower} { - string tolower \u01e2AB -} \u01e3ab + string tolower \u01E2AB +} \u01E3ab test utf-13.1 {Tcl_UtfToTitle} { string totitle {} @@ -800,11 +799,11 @@ test utf-13.2 {Tcl_UtfToTitle} { string totitle abc } Abc test utf-13.3 {Tcl_UtfToTitle} { - string totitle \u00e3ab -} \u00c3ab + string totitle \u00E3ab +} \u00C3ab test utf-13.4 {Tcl_UtfToTitle} { - string totitle 
\u01f3ab -} \u01f2ab + string totitle \u01F3ab +} \u01F2ab test utf-14.1 {Tcl_UtfNcasecmp} { string compare -nocase a b @@ -823,7 +822,7 @@ test utf-15.1 {Tcl_UniCharToUpper, negative delta} { string toupper aA } AA test utf-15.2 {Tcl_UniCharToUpper, positive delta} { - string toupper \u0178\u00ff + string toupper \u0178\xFF } \u0178\u0178 test utf-15.3 {Tcl_UniCharToUpper, no delta} { string toupper ! @@ -833,24 +832,24 @@ test utf-16.1 {Tcl_UniCharToLower, negative delta} { string tolower aA } aa test utf-16.2 {Tcl_UniCharToLower, positive delta} { - string tolower \u0178\u00ff\uA78D\u01c5 -} \u00ff\u00ff\u0265\u01c6 + string tolower \u0178\xFF\uA78D\u01C5 +} \xFF\xFF\u0265\u01C6 test utf-17.1 {Tcl_UniCharToLower, no delta} { string tolower ! } ! test utf-18.1 {Tcl_UniCharToTitle, add one for title} { - string totitle \u01c4 -} \u01c5 + string totitle \u01C4 +} \u01C5 test utf-18.2 {Tcl_UniCharToTitle, subtract one for title} { - string totitle \u01c6 -} \u01c5 + string totitle \u01C6 +} \u01C5 test utf-18.3 {Tcl_UniCharToTitle, subtract delta for title (positive)} { - string totitle \u017f + string totitle \u017F } \u0053 test utf-18.4 {Tcl_UniCharToTitle, subtract delta for title (negative)} { - string totitle \u00ff + string totitle \xFF } \u0178 test utf-18.5 {Tcl_UniCharToTitle, no delta} { string totitle ! 
@@ -865,15 +864,15 @@ test utf-20.1 {TclUniCharNcmp} { test utf-21.1 {TclUniCharIsAlnum} { # this returns 1 with Unicode 7 compliance - string is alnum \u1040\u021f\u0220 + string is alnum \u1040\u021F\u0220 } {1} test utf-21.2 {unicode alnum char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - list [regexp {^[[:alnum:]]+$} \u1040\u021f\u0220] [regexp {^\w+$} \u1040\u021f\u0220_\u203f\u2040\u2054\ufe33\ufe34\ufe4d\ufe4e\ufe4f\uff3f] + list [regexp {^[[:alnum:]]+$} \u1040\u021F\u0220] [regexp {^\w+$} \u1040\u021F\u0220_\u203F\u2040\u2054\uFE33\uFE34\uFE4D\uFE4E\uFE4F\uFF3F] } {1 1} test utf-21.3 {unicode print char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - regexp {^[[:print:]]+$} \ufbc1 + regexp {^[[:print:]]+$} \uFBC1 } 1 test utf-21.4 {TclUniCharIsGraph} { # [Bug 3464428] @@ -885,11 +884,11 @@ test utf-21.5 {unicode graph char in regc_locale.c} { } {1} test utf-21.6 {TclUniCharIsGraph} { # [Bug 3464428] - string is graph \u00a0 + string is graph \xA0 } {0} test utf-21.7 {unicode graph char in regc_locale.c} { # [Bug 3464428] - regexp {[[:graph:]]} \u0020\u00a0\u2028\u2029 + regexp {[[:graph:]]} \x20\xA0\u2028\u2029 } {0} test utf-21.8 {TclUniCharIsPrint} { # [Bug 3464428] @@ -905,49 +904,47 @@ test utf-21.10 {unicode print char in regc_locale.c} { } {0} test utf-21.11 {TclUniCharIsControl} { # [Bug 3464428] - string is control \u0000\u001f\u00ad\u0605\u061c\u180e\u2066\ufeff + string is control \x00\x1F\xad\u0605\u061C\u180E\u2066\uFEFF } {1} test utf-21.12 {unicode control char in regc_locale.c} { # [Bug 3464428], [Bug a876646efe] - regexp {^[[:cntrl:]]*$} \u0000\u001f\u00ad\u0605\u061c\u180e\u2066\ufeff + regexp {^[[:cntrl:]]*$} \x00\x1F\xad\u0605\u061C\u180E\u2066\uFEFF } {1} test utf-22.1 {TclUniCharIsWordChar} { string wordend "xyz123_bar fg" 0 } 10 test utf-22.2 {TclUniCharIsWordChar} { - string wordend "x\u5080z123_bar\u203c fg" 0 + string wordend "x\u5080z123_bar\u203C fg" 0 } 10 test utf-23.1 {TclUniCharIsAlpha} { # 
this returns 1 with Unicode 7 compliance - string is alpha \u021f\u0220\u037f\u052f + string is alpha \u021F\u0220\u037F\u052F } {1} test utf-23.2 {unicode alpha char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - regexp {^[[:alpha:]]+$} \u021f\u0220\u037f\u052f + regexp {^[[:alpha:]]+$} \u021F\u0220\u037F\u052F } {1} test utf-24.1 {TclUniCharIsDigit} { # this returns 1 with Unicode 7 compliance - string is digit \u1040\uabf0 + string is digit \u1040\uABF0 } {1} test utf-24.2 {unicode digit char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - list [regexp {^[[:digit:]]+$} \u1040\uabf0] [regexp {^\d+$} \u1040\uabf0] + list [regexp {^[[:digit:]]+$} \u1040\uABF0] [regexp {^\d+$} \u1040\uABF0] } {1 1} test utf-24.3 {TclUniCharIsSpace} { # this returns 1 with Unicode 7 compliance - string is space \u1680\u180e\u202f + string is space \u1680\u180E\u202F } {1} test utf-24.4 {unicode space char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - list [regexp {^[[:space:]]+$} \u1680\u180e\u202f] [regexp {^\s+$} \u1680\u180e\u202f] + list [regexp {^[[:space:]]+$} \u1680\u180E\u202F] [regexp {^\s+$} \u1680\u180E\u202F] } {1 1} -testConstraint teststringobj [llength [info commands teststringobj]] - test utf-25.1 {Tcl_UniCharNcasecmp} -constraints teststringobj \ -setup { testobj freeallvars diff --git a/tests/util.test b/tests/util.test index 85c06dd..a483de1 100644 --- a/tests/util.test +++ b/tests/util.test @@ -15,6 +15,7 @@ if {[lsearch [namespace children] ::tcltest] == -1} { testConstraint testdstring [llength [info commands testdstring]] testConstraint testconcatobj [llength [info commands testconcatobj]] testConstraint testdoubledigits [llength [info commands testdoubledigits]] +testConstraint compat85 [expr {[format %c 0x010000] == "\uFFFD"}] # Big test for correct ordering of data in [expr] -- cgit v0.12 From ea9467702aecb854ba8cd803edbb38c4590aa928 Mon Sep 17 00:00:00 2001 From: dgp Date: Sat, 18 Apr 2020 15:02:51 +0000 
Subject: Make TCL_UTF_MAX=4 build test clean again. --- generic/tcl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generic/tcl.h b/generic/tcl.h index d7d064c..7378a8f 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2148,7 +2148,7 @@ typedef struct Tcl_Parse { * reflected in regcustom.h. */ -#if TCL_UTF_MAX > 3 +#if TCL_UTF_MAX > 4 /* * unsigned int isn't 100% accurate as it should be a strict 4-byte value * (perhaps wchar_t). 64-bit systems may have troubles. The size of this -- cgit v0.12 From f866e98a39dc53d4864e3b04119b7dc2fd65078d Mon Sep 17 00:00:00 2001 From: dgp Date: Sat, 18 Apr 2020 15:11:22 +0000 Subject: regexp engine has to agree about the sizeof(Tcl_UniChar). --- generic/regcustom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generic/regcustom.h b/generic/regcustom.h index ac33087..57a2d47 100644 --- a/generic/regcustom.h +++ b/generic/regcustom.h @@ -97,7 +97,7 @@ typedef int celt; /* Type to hold chr, or NOCELT */ #define NOCELT (-1) /* Celt value which is not valid chr */ #define CHR(c) (UCHAR(c)) /* Turn char literal into chr literal */ #define DIGITVAL(c) ((c)-'0') /* Turn chr digit into its value */ -#if TCL_UTF_MAX > 3 +#if TCL_UTF_MAX > 4 #define CHRBITS 32 /* Bits in a chr; must not use sizeof */ #define CHR_MIN 0x00000000 /* Smallest and largest chr; the value */ #define CHR_MAX 0xffffffff /* CHR_MAX-CHR_MIN+1 should fit in uchr */ -- cgit v0.12 From 4f3cc7f661e8ae301fd9b4aaf7a4c66d94897ec3 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sat, 18 Apr 2020 19:54:08 +0000 Subject: Clean-up testcases: Constant use of uppercase in hex-values. Use "testbytestring" in stead of "bytestring". 
Mark tests not working with TCL_UTF_MAX>3 with "compat85" --- tests/utf.test | 278 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 137 insertions(+), 141 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index 01e0bb2..189b85d 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -13,54 +13,60 @@ if {[lsearch [namespace children] ::tcltest] == -1} { namespace import -force ::tcltest::* } +testConstraint compat85 [expr {[format %c 0x010000] == "\uFFFD"}] testConstraint testbytestring [llength [info commands testbytestring]] +testConstraint testfindfirst [llength [info commands testfindfirst]] +testConstraint testfindlast [llength [info commands testfindlast]] +testConstraint testnumutfchars [llength [info commands testnumutfchars]] +testConstraint teststringobj [llength [info commands teststringobj]] +testConstraint testutfnext [llength [info commands testutfnext]] +testConstraint testutfprev [llength [info commands testutfprev]] catch {unset x} -test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} { - set x \x01 -} [bytestring "\x01"] -test utf-1.2 {Tcl_UniCharToUtf: 2 byte sequences} { - set x "\x00" -} [bytestring "\xc0\x80"] -test utf-1.3 {Tcl_UniCharToUtf: 2 byte sequences} { - set x "\xe0" -} [bytestring "\xc3\xa0"] -test utf-1.4 {Tcl_UniCharToUtf: 3 byte sequences} { - set x "\u4e4e" -} [bytestring "\xe4\xb9\x8e"] -test utf-1.5 {Tcl_UniCharToUtf: overflowed Tcl_UniChar} { - format %c 0x110000 -} [bytestring "\xef\xbf\xbd"] -test utf-1.6 {Tcl_UniCharToUtf: negative Tcl_UniChar} { - format %c -1 -} [bytestring "\xef\xbf\xbd"] - +test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} testbytestring { + expr {"\x01" eq [testbytestring "\x01"]} +} 1 +test utf-1.2 {Tcl_UniCharToUtf: 2 byte sequences} testbytestring { + expr {"\x00" eq [testbytestring "\xC0\x80"]} +} 1 +test utf-1.3 {Tcl_UniCharToUtf: 2 byte sequences} testbytestring { + expr {"\xE0" eq [testbytestring "\xC3\xA0"]} +} 1 +test utf-1.4 {Tcl_UniCharToUtf: 3 byte sequences} 
testbytestring { + expr {"\u4E4E" eq [testbytestring "\xE4\xB9\x8E"]} +} 1 +test utf-1.5 {Tcl_UniCharToUtf: overflowed Tcl_UniChar} testbytestring { + expr {[format %c 0x110000] eq [testbytestring "\xEF\xBF\xBD"]} +} 1 +test utf-1.6 {Tcl_UniCharToUtf: negative Tcl_UniChar} testbytestring { + expr {[format %c -1] eq [testbytestring "\xEF\xBF\xBD"]} +} 1 test utf-2.1 {Tcl_UtfToUniChar: low ascii} { string length "abc" } {3} -test utf-2.2 {Tcl_UtfToUniChar: naked trail bytes} { - string length [bytestring "\x82\x83\x84"] +test utf-2.2 {Tcl_UtfToUniChar: naked trail bytes} testbytestring { + string length [testbytestring "\x82\x83\x84"] } {3} -test utf-2.3 {Tcl_UtfToUniChar: lead (2-byte) followed by non-trail} { - string length [bytestring "\xC2"] +test utf-2.3 {Tcl_UtfToUniChar: lead (2-byte) followed by non-trail} testbytestring { + string length [testbytestring "\xC2"] } {1} -test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} { - string length [bytestring "\xC2\xa2"] +test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} testbytestring { + string length [testbytestring "\xC2\xA2"] } {1} -test utf-2.5 {Tcl_UtfToUniChar: lead (3-byte) followed by non-trail} { - string length [bytestring "\xE2"] +test utf-2.5 {Tcl_UtfToUniChar: lead (3-byte) followed by non-trail} testbytestring { + string length [testbytestring "\xE2"] } {1} -test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} { - string length [bytestring "\xE2\xA2"] +test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} testbytestring { + string length [testbytestring "\xE2\xA2"] } {2} -test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} { - string length [bytestring "\xE4\xb9\x8e"] +test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} testbytestring { + string length [testbytestring "\xE4\xB9\x8E"] } {1} -test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring} -body { +test utf-2.8 
{Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring compat85} -body { string length [testbytestring "\xF0\x90\x80\x80"] } -result {4} -test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring} -body { +test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring compat85} -body { string length [testbytestring "\xF4\x8F\xBF\xBF"] } -result {4} test utf-2.10 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail, underflow} testbytestring { @@ -77,57 +83,51 @@ test utf-2.12 {Tcl_UtfToUniChar: longer UTF sequences not supported} testbytestr test utf-3.1 {Tcl_UtfCharComplete} { } {} -testConstraint testnumutfchars [llength [info commands testnumutfchars]] -testConstraint testfindfirst [llength [info commands testfindfirst]] -testConstraint testfindlast [llength [info commands testfindlast]] - test utf-4.1 {Tcl_NumUtfChars: zero length} testnumutfchars { testnumutfchars "" } {0} -test utf-4.2 {Tcl_NumUtfChars: length 1} testnumutfchars { - testnumutfchars [bytestring "\xC2\xA2"] +test utf-4.2 {Tcl_NumUtfChars: length 1} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xC2\xA2"] } {1} -test utf-4.3 {Tcl_NumUtfChars: long string} testnumutfchars { - testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"] +test utf-4.3 {Tcl_NumUtfChars: long string} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\uA2\x4E"] } {7} -test utf-4.4 {Tcl_NumUtfChars: #u0000} testnumutfchars { - testnumutfchars [bytestring "\xC0\x80"] +test utf-4.4 {Tcl_NumUtfChars: #u0000} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xC0\x80"] } {1} test utf-4.5 {Tcl_NumUtfChars: zero length, calc len} testnumutfchars { testnumutfchars "" 0 } {0} -test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} testnumutfchars { - testnumutfchars [bytestring "\xC2\xA2"] 1 +test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} 
{testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xC2\xA2"] 1 } {1} -test utf-4.7 {Tcl_NumUtfChars: long string, calc len} testnumutfchars { - testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"] 10 +test utf-4.7 {Tcl_NumUtfChars: long string, calc len} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\uA2\x4E"] 10 } {7} -test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} testnumutfchars { - testnumutfchars [bytestring "\xC0\x80"] 1 +test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xC0\x80"] 1 } {1} # Bug [2738427]: Tcl_NumUtfChars(...) no overflow check -test utf-4.9 {Tcl_NumUtfChars: #u20AC, calc len, incomplete} testnumutfchars { - testnumutfchars [bytestring "\xE2\x82\xAC"] 2 +test utf-4.9 {Tcl_NumUtfChars: #u20AC, calc len, incomplete} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\xE2\x82\xAC"] 2 } {2} -test utf-4.10 {Tcl_NumUtfChars: #u0000, calc len, overcomplete} testnumutfchars { - testnumutfchars [bytestring "\x00"] 2 +test utf-4.10 {Tcl_NumUtfChars: #u0000, calc len, overcomplete} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring "\x00"] 2 } {2} -test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 characater} testnumutfchars { - testnumutfchars [bytestring \xf0\x9f\x92\xa9] 3 +test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 characater} {testnumutfchars testbytestring} { + testnumutfchars [testbytestring \xF0\x9F\x92\xA9] 3 } {3} -test utf-4.12 {Tcl_NumUtfChars: #4-byte UTF-8 character} testnumutfchars { - testnumutfchars [bytestring \xf0\x9f\x92\xa9] 4 +test utf-4.12 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring compat85} { + testnumutfchars [testbytestring \xF0\x9F\x92\xA9] 4 } {4} -test utf-5.1 {Tcl_UtfFindFirst} testfindfirst { - testfindfirst [bytestring "abcbc"] 98 +test utf-5.1 {Tcl_UtfFindFirst} {testfindfirst 
testbytestring} { + testfindfirst [testbytestring "abcbc"] 98 } {bcbc} -test utf-5.2 {Tcl_UtfFindLast} testfindlast { - testfindlast [bytestring "abcbc"] 98 +test utf-5.2 {Tcl_UtfFindLast} {testfindlast testbytestring} { + testfindlast [testbytestring "abcbc"] 98 } {bc} -testConstraint testutfnext [llength [info commands testutfnext]] - test utf-6.1 {Tcl_UtfNext} testutfnext { # This takes the pointer one past the terminating NUL. # This is really an invalid call. @@ -334,7 +334,7 @@ test utf-6.67 {Tcl_UtfNext} testutfnext { test utf-6.68 {Tcl_UtfNext} testutfnext { testutfnext \xF2\xA0\xA0G } 1 -test utf-6.69 {Tcl_UtfNext} testutfnext { +test utf-6.69 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0 } 1 test utf-6.70 {Tcl_UtfNext} testutfnext { @@ -349,22 +349,22 @@ test utf-6.71 {Tcl_UtfNext} testutfnext { test utf-6.73 {Tcl_UtfNext} testutfnext { testutfnext \xF2\xA0\xA0\xF8 } 1 -test utf-6.74 {Tcl_UtfNext} testutfnext { +test utf-6.74 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0G } 1 -test utf-6.75 {Tcl_UtfNext} testutfnext { +test utf-6.75 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xA0 } 1 -test utf-6.76 {Tcl_UtfNext} testutfnext { +test utf-6.76 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xD0 } 1 -test utf-6.77 {Tcl_UtfNext} testutfnext { +test utf-6.77 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xE8 } 1 -test utf-6.78 {Tcl_UtfNext} testutfnext { +test utf-6.78 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xF2 } 1 -test utf-6.79 {Tcl_UtfNext} testutfnext { +test utf-6.79 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0G\xF8 } 1 test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext { @@ -388,7 +388,7 @@ test utf-6.85 {Tcl_UtfNext - overlong sequences} testutfnext { test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xF0\x80\x80\x80 } 1 -test utf-6.87 {Tcl_UtfNext - overlong 
sequences} testutfnext { +test utf-6.87 {Tcl_UtfNext - overlong sequences} {testutfnext compat85} { testutfnext \xF0\x90\x80\x80 } 1 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { @@ -403,15 +403,13 @@ test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {te test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { testutfnext \xF0\x80\x80 1 } 2 -test utf-6.90 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext { +test utf-6.90 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext compat85} { testutfnext \xF4\x8F\xBF\xBF } 1 test utf-6.91 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext { testutfnext \xF4\x90\x80\x80 } 1 -testConstraint testutfprev [llength [info commands testutfprev]] - test utf-7.1 {Tcl_UtfPrev} testutfprev { testutfprev {} } 0 @@ -475,13 +473,13 @@ test utf-7.9.1 {Tcl_UtfPrev} testutfprev { test utf-7.9.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xF8\xA0 3 } 2 -test utf-7.10 {Tcl_UtfPrev} testutfprev { +test utf-7.10 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0 } 2 -test utf-7.10.1 {Tcl_UtfPrev} testutfprev { +test utf-7.10.1 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xA0 3 } 2 -test utf-7.10.2 {Tcl_UtfPrev} testutfprev { +test utf-7.10.2 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xF8\xA0 3 } 2 test utf-7.11 {Tcl_UtfPrev} testutfprev { @@ -523,13 +521,13 @@ test utf-7.14.1 {Tcl_UtfPrev} testutfprev { test utf-7.14.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xF8 4 } 3 -test utf-7.15 {Tcl_UtfPrev} testutfprev { +test utf-7.15 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0 } 3 -test utf-7.15.1 {Tcl_UtfPrev} testutfprev { +test utf-7.15.1 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xA0 4 } 3 -test utf-7.15.2 {Tcl_UtfPrev} testutfprev { +test utf-7.15.2 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xF8 4 
} 3 test utf-7.16 {Tcl_UtfPrev} testutfprev { @@ -562,7 +560,7 @@ test utf-7.18.2 {Tcl_UtfPrev} testutfprev { test utf-7.19 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xA0 } 4 -test utf-7.20 {Tcl_UtfPrev} testutfprev { +test utf-7.20 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xA0 } 4 test utf-7.21 {Tcl_UtfPrev} testutfprev { @@ -622,16 +620,16 @@ test utf-7.36 {Tcl_UtfPrev -- overlong sequence} testutfprev { test utf-7.37 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\xA0\x80 3 } 1 -test utf-7.38 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.38 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xE0\xA0\x80 2 } 1 -test utf-7.39 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.39 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xF0\x90\x80\x80 } 4 -test utf-7.40 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.40 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xF0\x90\x80\x80 4 } 3 -test utf-7.41 {Tcl_UtfPrev -- overlong sequence} testutfprev { +test utf-7.41 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xF0\x90\x80\x80 3 } 2 test utf-7.42 {Tcl_UtfPrev -- overlong sequence} testutfprev { @@ -658,13 +656,13 @@ test utf-7.47.1 {Tcl_UtfPrev, pointing to 3th byte of 3-byte valid sequence} {te test utf-7.47.2 {Tcl_UtfPrev, pointing to 3th byte of 3-byte invalid sequence} {testutfprev} { testutfprev \xE8\xA0\x00 2 } 0 -test utf-7.48 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { +test utf-7.48 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { testutfprev A\xF4\x8F\xBF\xBF } 4 -test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { +test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { testutfprev A\xF4\x8F\xBF\xBF 4 } 3 -test utf-7.48.2 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { +test utf-7.48.2 
{Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { testutfprev A\xF4\x8F\xBF\xBF 3 } 2 test utf-7.48.3 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { @@ -708,18 +706,18 @@ test utf-10.1 {Tcl_UtfBackslash: dst == NULL} { set x \n } { } -test utf-10.2 {Tcl_UtfBackslash: \u subst} { - set x \ua2 -} [bytestring "\xc2\xa2"] -test utf-10.3 {Tcl_UtfBackslash: longer \u subst} { - set x \u4e21 -} [bytestring "\xe4\xb8\xa1"] -test utf-10.4 {Tcl_UtfBackslash: stops at first non-hex} { - set x \u4e2k -} "[bytestring \xd3\xa2]k" -test utf-10.5 {Tcl_UtfBackslash: stops after 4 hex chars} { - set x \u4e216 -} "[bytestring \xe4\xb8\xa1]6" +test utf-10.2 {Tcl_UtfBackslash: \u subst} testbytestring { + expr {"\uA2" eq [testbytestring "\xC2\xA2"]} +} 1 +test utf-10.3 {Tcl_UtfBackslash: longer \u subst} testbytestring { + expr {"\u4E21" eq [testbytestring "\xE4\xB8\xA1"]} +} 1 +test utf-10.4 {Tcl_UtfBackslash: stops at first non-hex} testbytestring { + expr {"\u4E2k" eq "[testbytestring \xD3\xA2]k"} +} 1 +test utf-10.5 {Tcl_UtfBackslash: stops after 4 hex chars} testbytestring { + expr {"\u4E216" eq [testbytestring "\xE4\xB8\xA1"]6} +} 1 proc bsCheck {char num} { global errNum test utf-10.$errNum {backslash substitution} { @@ -774,11 +772,11 @@ test utf-11.2 {Tcl_UtfToUpper} { string toupper abc } ABC test utf-11.3 {Tcl_UtfToUpper} { - string toupper \u00e3ab -} \u00c3AB + string toupper \xE3gh +} \xC3GH test utf-11.4 {Tcl_UtfToUpper} { - string toupper \u01e3ab -} \u01e2AB + string toupper \u01E3ab +} \u01E2AB test utf-12.1 {Tcl_UtfToLower} { string tolower {} @@ -787,11 +785,11 @@ test utf-12.2 {Tcl_UtfToLower} { string tolower ABC } abc test utf-12.3 {Tcl_UtfToLower} { - string tolower \u00c3AB -} \u00e3ab + string tolower \xC3GH +} \xE3gh test utf-12.4 {Tcl_UtfToLower} { - string tolower \u01e2AB -} \u01e3ab + string tolower \u01E2AB +} \u01E3ab test utf-13.1 {Tcl_UtfToTitle} { string totitle {} @@ -800,11 +798,11 @@ test utf-13.2 {Tcl_UtfToTitle} { 
string totitle abc } Abc test utf-13.3 {Tcl_UtfToTitle} { - string totitle \u00e3ab -} \u00c3ab + string totitle \xE3GH +} \xC3gh test utf-13.4 {Tcl_UtfToTitle} { - string totitle \u01f3ab -} \u01f2ab + string totitle \u01F3AB +} \u01F2ab test utf-14.1 {Tcl_UtfNcasecmp} { string compare -nocase a b @@ -823,7 +821,7 @@ test utf-15.1 {Tcl_UniCharToUpper, negative delta} { string toupper aA } AA test utf-15.2 {Tcl_UniCharToUpper, positive delta} { - string toupper \u0178\u00ff + string toupper \u0178\xFF } \u0178\u0178 test utf-15.3 {Tcl_UniCharToUpper, no delta} { string toupper ! @@ -833,24 +831,24 @@ test utf-16.1 {Tcl_UniCharToLower, negative delta} { string tolower aA } aa test utf-16.2 {Tcl_UniCharToLower, positive delta} { - string tolower \u0178\u00ff\uA78D\u01c5 -} \u00ff\u00ff\u0265\u01c6 + string tolower \u0178\xFF\uA78D\u01C5 +} \xFF\xFF\u0265\u01C6 test utf-17.1 {Tcl_UniCharToLower, no delta} { string tolower ! } ! test utf-18.1 {Tcl_UniCharToTitle, add one for title} { - string totitle \u01c4 -} \u01c5 + string totitle \u01C4 +} \u01C5 test utf-18.2 {Tcl_UniCharToTitle, subtract one for title} { - string totitle \u01c6 -} \u01c5 + string totitle \u01C6 +} \u01C5 test utf-18.3 {Tcl_UniCharToTitle, subtract delta for title (positive)} { - string totitle \u017f -} \u0053 + string totitle \u017F +} \x53 test utf-18.4 {Tcl_UniCharToTitle, subtract delta for title (negative)} { - string totitle \u00ff + string totitle \xFF } \u0178 test utf-18.5 {Tcl_UniCharToTitle, no delta} { string totitle ! 
@@ -865,15 +863,15 @@ test utf-20.1 {TclUniCharNcmp} { test utf-21.1 {TclUniCharIsAlnum} { # this returns 1 with Unicode 7 compliance - string is alnum \u1040\u021f\u0220 + string is alnum \u1040\u021F\u0220 } {1} test utf-21.2 {unicode alnum char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - list [regexp {^[[:alnum:]]+$} \u1040\u021f\u0220] [regexp {^\w+$} \u1040\u021f\u0220_\u203f\u2040\u2054\ufe33\ufe34\ufe4d\ufe4e\ufe4f\uff3f] + list [regexp {^[[:alnum:]]+$} \u1040\u021F\u0220] [regexp {^\w+$} \u1040\u021F\u0220_\u203F\u2040\u2054\uFE33\uFE34\uFE4D\uFE4E\uFE4F\uFF3F] } {1 1} test utf-21.3 {unicode print char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - regexp {^[[:print:]]+$} \ufbc1 + regexp {^[[:print:]]+$} \uFBC1 } 1 test utf-21.4 {TclUniCharIsGraph} { # [Bug 3464428] @@ -885,69 +883,67 @@ test utf-21.5 {unicode graph char in regc_locale.c} { } {1} test utf-21.6 {TclUniCharIsGraph} { # [Bug 3464428] - string is graph \u00a0 + string is graph \xA0 } {0} test utf-21.7 {unicode graph char in regc_locale.c} { # [Bug 3464428] - regexp {[[:graph:]]} \u0020\u00a0\u2028\u2029 + regexp {[[:graph:]]} \x20\xA0\u2028\u2029 } {0} test utf-21.8 {TclUniCharIsPrint} { # [Bug 3464428] - string is print \u0009 + string is print \x09 } {0} test utf-21.9 {unicode print char in regc_locale.c} { # [Bug 3464428] - regexp {[[:print:]]} \u0009 + regexp {[[:print:]]} \x09 } {0} test utf-21.10 {unicode print char in regc_locale.c} { # [Bug 3464428] - regexp {[[:print:]]} \u0009 + regexp {[[:print:]]} \x09 } {0} test utf-21.11 {TclUniCharIsControl} { # [Bug 3464428] - string is control \u0000\u001f\u00ad\u0605\u061c\u180e\u2066\ufeff + string is control \x00\x1F\xAD\u0605\u061C\u180E\u2066\uFEFF } {1} test utf-21.12 {unicode control char in regc_locale.c} { # [Bug 3464428], [Bug a876646efe] - regexp {^[[:cntrl:]]*$} \u0000\u001f\u00ad\u0605\u061c\u180e\u2066\ufeff + regexp {^[[:cntrl:]]*$} \x00\x1F\xAD\u0605\u061C\u180E\u2066\uFEFF } {1} test 
utf-22.1 {TclUniCharIsWordChar} { string wordend "xyz123_bar fg" 0 } 10 test utf-22.2 {TclUniCharIsWordChar} { - string wordend "x\u5080z123_bar\u203c fg" 0 + string wordend "x\u5080z123_bar\u203C fg" 0 } 10 test utf-23.1 {TclUniCharIsAlpha} { # this returns 1 with Unicode 7 compliance - string is alpha \u021f\u0220\u037f\u052f + string is alpha \u021F\u0220\u037F\u052F } {1} test utf-23.2 {unicode alpha char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - regexp {^[[:alpha:]]+$} \u021f\u0220\u037f\u052f + regexp {^[[:alpha:]]+$} \u021F\u0220\u037F\u052F } {1} test utf-24.1 {TclUniCharIsDigit} { # this returns 1 with Unicode 7 compliance - string is digit \u1040\uabf0 + string is digit \u1040\uABF0 } {1} test utf-24.2 {unicode digit char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - list [regexp {^[[:digit:]]+$} \u1040\uabf0] [regexp {^\d+$} \u1040\uabf0] + list [regexp {^[[:digit:]]+$} \u1040\uABF0] [regexp {^\d+$} \u1040\uABF0] } {1 1} test utf-24.3 {TclUniCharIsSpace} { # this returns 1 with Unicode 7 compliance - string is space \u1680\u180e\u202f + string is space \u1680\u180E\u202F } {1} test utf-24.4 {unicode space char in regc_locale.c} { # this returns 1 with Unicode 7 compliance - list [regexp {^[[:space:]]+$} \u1680\u180e\u202f] [regexp {^\s+$} \u1680\u180e\u202f] + list [regexp {^[[:space:]]+$} \u1680\u180E\u202F] [regexp {^\s+$} \u1680\u180E\u202F] } {1 1} -testConstraint teststringobj [llength [info commands teststringobj]] - test utf-25.1 {Tcl_UniCharNcasecmp} -constraints teststringobj \ -setup { testobj freeallvars -- cgit v0.12 From 1a343cd043776b8acc3c4a047a10556c70f077dd Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Sun, 19 Apr 2020 19:37:05 +0000 Subject: More test-cases. 
Fix wrong quoting in testcase utf-10.5 --- tests/utf.test | 52 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index 189b85d..946aa83 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -42,6 +42,7 @@ test utf-1.5 {Tcl_UniCharToUtf: overflowed Tcl_UniChar} testbytestring { test utf-1.6 {Tcl_UniCharToUtf: negative Tcl_UniChar} testbytestring { expr {[format %c -1] eq [testbytestring "\xEF\xBF\xBD"]} } 1 + test utf-2.1 {Tcl_UtfToUniChar: low ascii} { string length "abc" } {3} @@ -90,7 +91,7 @@ test utf-4.2 {Tcl_NumUtfChars: length 1} {testnumutfchars testbytestring} { testnumutfchars [testbytestring "\xC2\xA2"] } {1} test utf-4.3 {Tcl_NumUtfChars: long string} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\uA2\x4E"] + testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\xA2\x4E"] } {7} test utf-4.4 {Tcl_NumUtfChars: #u0000} {testnumutfchars testbytestring} { testnumutfchars [testbytestring "\xC0\x80"] @@ -693,6 +694,18 @@ test utf-8.3 {Tcl_UniCharAtIndex: index > 0} { test utf-8.4 {Tcl_UniCharAtIndex: index > 0} { string index \u4E4E\u25A\xFF\u543 2 } "\uFF" +test utf-8.5 {Tcl_UniCharAtIndex: high surrogate} { + string index \uD842 0 +} "\uD842" +test utf-8.6 {Tcl_UniCharAtIndex: low surrogate} { + string index \uDC42 0 +} "\uDC42" +test utf-8.7 {Tcl_UniCharAtIndex: Emoji} compat85 { + string index \uD83D\uDE00 0 +} "\uD83D" +test utf-8.8 {Tcl_UniCharAtIndex: Emoji} compat85 { + string index \uD83D\uDE00 1 +} "\uDE00" test utf-9.1 {Tcl_UtfAtIndex: index = 0} { string range abcd 0 2 @@ -700,6 +713,12 @@ test utf-9.1 {Tcl_UtfAtIndex: index = 0} { test utf-9.2 {Tcl_UtfAtIndex: index > 0} { string range \u4E4E\u25A\xFF\u543klmnop 1 5 } "\u25A\xFF\u543kl" +test utf-9.3 {Tcl_UtfAtIndex: index = 0, Emoji} compat85 { + string range \uD83D\uDE00G 0 0 +} "\uD83D" +test utf-9.4 {Tcl_UtfAtIndex: index > 0, Emoji} compat85 { + 
string range \uD83D\uDE00G 1 1 +} "\uDE00" test utf-10.1 {Tcl_UtfBackslash: dst == NULL} { @@ -716,7 +735,7 @@ test utf-10.4 {Tcl_UtfBackslash: stops at first non-hex} testbytestring { expr {"\u4E2k" eq "[testbytestring \xD3\xA2]k"} } 1 test utf-10.5 {Tcl_UtfBackslash: stops after 4 hex chars} testbytestring { - expr {"\u4E216" eq [testbytestring "\xE4\xB8\xA1"]6} + expr {"\u4E216" eq "[testbytestring \xE4\xB8\xA1]6"} } 1 proc bsCheck {char num} { global errNum @@ -775,8 +794,8 @@ test utf-11.3 {Tcl_UtfToUpper} { string toupper \xE3gh } \xC3GH test utf-11.4 {Tcl_UtfToUpper} { - string toupper \u01E3ab -} \u01E2AB + string toupper \u01E3gh +} \u01E2GH test utf-12.1 {Tcl_UtfToLower} { string tolower {} @@ -788,8 +807,14 @@ test utf-12.3 {Tcl_UtfToLower} { string tolower \xC3GH } \xE3gh test utf-12.4 {Tcl_UtfToLower} { - string tolower \u01E2AB -} \u01E3ab + string tolower \u01E2GH +} \u01E3gh +test utf-12.5 {Tcl_UtfToLower Georgian (new in Unicode 11)} { + string tolower \u10D0\u1C90 +} \u10D0\u10D0 +test utf-12.6 {Tcl_UtfToUpper low/high surrogate)} { + string tolower \uDC24\uD824 +} \uDC24\uD824 test utf-13.1 {Tcl_UtfToTitle} { string totitle {} @@ -803,6 +828,15 @@ test utf-13.3 {Tcl_UtfToTitle} { test utf-13.4 {Tcl_UtfToTitle} { string totitle \u01F3AB } \u01F2ab +test utf-13.5 {Tcl_UtfToTitle Georgian (new in Unicode 11)} { + string totitle \u10D0\u1C90 +} \u10D0\u1C90 +test utf-13.6 {Tcl_UtfToTitle Georgian (new in Unicode 11)} { + string totitle \u1C90\u10D0 +} \u1C90\u10D0 +test utf-13.7 {Tcl_UtfToTitle low/high surrogate)} { + string totitle \uDC24\uD824 +} \uDC24\uD824 test utf-14.1 {Tcl_UtfNcasecmp} { string compare -nocase a b @@ -854,9 +888,11 @@ test utf-18.5 {Tcl_UniCharToTitle, no delta} { string totitle ! } ! 
-test utf-19.1 {TclUniCharLen} { +test utf-19.1 {TclUniCharLen} -body { list [regexp \\d abc456def foo] $foo -} {1 4} +} -cleanup { + unset -nocomplain foo +} -result {1 4} test utf-20.1 {TclUniCharNcmp} { } {} -- cgit v0.12 From 9bcd48e5ffd32d1858e4d0f90a4eaee550ede17f Mon Sep 17 00:00:00 2001 From: dgp Date: Sun, 19 Apr 2020 22:02:10 +0000 Subject: typo --- tests/utf.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utf.test b/tests/utf.test index 946aa83..07863b9 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -103,7 +103,7 @@ test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} {testnumutfchars testbytestri testnumutfchars [testbytestring "\xC2\xA2"] 1 } {1} test utf-4.7 {Tcl_NumUtfChars: long string, calc len} {testnumutfchars testbytestring} { - testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\uA2\x4E"] 10 + testnumutfchars [testbytestring "abc\xC2\xA2\xE4\xB9\x8E\xA2\x4E"] 10 } {7} test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} {testnumutfchars testbytestring} { testnumutfchars [testbytestring "\xC0\x80"] 1 -- cgit v0.12 From bb5381f946565a91e146910d62c56b40c02c5193 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 20 Apr 2020 05:35:54 +0000 Subject: Reconcile tests to the 8.5 branch history. 
--- tests/utf.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utf.test b/tests/utf.test index 07863b9..1ca3647 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -621,7 +621,7 @@ test utf-7.36 {Tcl_UtfPrev -- overlong sequence} testutfprev { test utf-7.37 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\xA0\x80 3 } 1 -test utf-7.38 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { +test utf-7.38 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\xA0\x80 2 } 1 test utf-7.39 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { -- cgit v0.12 From 534db753aefcbe8cbdbec69611e9c6e31ea3deec Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 20 Apr 2020 06:45:47 +0000 Subject: Backport the encoding fix for source-7.2 in TCL_UTF_MAX=6 build. --- generic/tclEncoding.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 6c16827..5a9d2d5 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2470,20 +2470,33 @@ UtfToUnicodeProc( if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; break; - } + } src += TclUtfToUniChar(src, &ch); /* * Need to handle this in a way that won't cause misalignment * by casting dst to a Tcl_UniChar. [Bug 1122671] - * XXX: This hard-codes the assumed size of Tcl_UniChar as 2. 
*/ #ifdef WORDS_BIGENDIAN +#if TCL_UTF_MAX > 4 + *dst++ = (ch >> 24); + *dst++ = ((ch >> 16) & 0xFF); + *dst++ = ((ch >> 8) & 0xFF); + *dst++ = (ch & 0xFF); +#else *dst++ = (ch >> 8); *dst++ = (ch & 0xFF); +#endif +#else +#if TCL_UTF_MAX > 4 + *dst++ = (ch & 0xFF); + *dst++ = ((ch >> 8) & 0xFF); + *dst++ = ((ch >> 16) & 0xFF); + *dst++ = (ch >> 24); #else *dst++ = (ch & 0xFF); *dst++ = (ch >> 8); #endif +#endif } *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; -- cgit v0.12 From 0424b820bc8101075ba4673a8d07df870348f134 Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 20 Apr 2020 07:34:28 +0000 Subject: Backport the fix for encoding-16.1 in a TCL_UTF_MAX=6 build. --- generic/tclEncoding.c | 240 +++++++++++++++++++++++++++----------------------- 1 file changed, 128 insertions(+), 112 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 5a9d2d5..da03055 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -83,7 +83,7 @@ typedef struct TableEncodingData { } TableEncodingData; /* - * The following structures is the clientData for a dynamically-loaded, + * Each of the following structures is the clientData for a dynamically-loaded * escape-driven encoding that is itself comprised of other simpler encodings. * An example is "iso-2022-jp", which uses escape sequences to switch between * ascii, jis0208, jis0212, gb2312, and ksc5601. Note that "escape-driven" @@ -117,8 +117,8 @@ typedef struct EscapeEncodingData { * 0. */ int numSubTables; /* Length of following array. */ EscapeSubTable subTables[1];/* Information about each EscapeSubTable used - * by this encoding type. The actual size will - * be as large as necessary to hold all + * by this encoding type. The actual size is + * as large as necessary to hold all * EscapeSubTables. */ } EscapeEncodingData; @@ -156,7 +156,7 @@ static ProcessGlobalValue encodingFileMap = { * A list of directories making up the "library path". 
Historically this * search path has served many uses, but the only one remaining is a base for * the encodingSearchPath above. If the application does not explicitly set - * the encodingSearchPath, then it will be initialized by appending /encoding + * the encodingSearchPath, then it is initialized by appending /encoding * to each directory in this "libraryPath". */ @@ -177,7 +177,7 @@ TCL_DECLARE_MUTEX(encodingMutex) /* * The following are used to hold the default and current system encodings. * If NULL is passed to one of the conversion routines, the current setting of - * the system encoding will be used to perform the conversion. + * the system encoding is used to perform the conversion. */ static Tcl_Encoding defaultEncoding; @@ -429,9 +429,8 @@ TclGetLibraryPath(void) * Keeps the per-thread copy of the library path current with changes to * the global copy. * - * NOTE: this routine returns void, so there's no way to report the error - * that searchPath is not a valid list. In that case, this routine will - * silently do nothing. + * Since the result of this routine is void, if searchPath is not a valid + * list this routine silently does nothing. * *---------------------------------------------------------------------- */ @@ -453,17 +452,16 @@ TclSetLibraryPath( * * FillEncodingFileMap -- * - * Called to bring the encoding file map in sync with the current value - * of the encoding search path. + * Called to update the encoding file map with the current value + * of the encoding search path. * - * Scan the directories on the encoding search path, find the *.enc - * files, and store the found pathnames in a map associated with the - * encoding name. + * Finds *.enc files in the directories on the encoding search path and + * stores the found pathnames in a map associated with the encoding name. * - * In particular, if $dir is on the encoding search path, and the file - * $dir/foo.enc is found, then store a "foo" -> $dir entry in the map.
- * Later, any need for the "foo" encoding will quickly * be able to - * construct the $dir/foo.enc pathname for reading the encoding data. + * If $dir is on the encoding search path and the file $dir/foo.enc is + * found, stores a "foo" -> $dir entry in the map. if the "foo" encoding + * is needed later, the $dir/foo.enc name can be quickly constructed in + * order to read the encoding data. * * Results: * None. @@ -544,19 +542,24 @@ void TclInitEncodingSubsystem(void) { Tcl_EncodingType type; + union { + char c; + short s; + } isLe; if (encodingsInitialized) { return; } + isLe.s = 1; Tcl_MutexLock(&encodingMutex); Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS); Tcl_MutexUnlock(&encodingMutex); /* - * Create a few initial encodings. Note that the UTF-8 to UTF-8 - * translation is not a no-op, because it will turn a stream of improperly - * formed UTF-8 into a properly formed stream. + * Create a few initial encodings. UTF-8 to UTF-8 translation is not a + * no-op because it turns a stream of improperly formed UTF-8 into a + * properly formed stream. */ type.encodingName = "identity"; @@ -583,7 +586,7 @@ TclInitEncodingSubsystem(void) type.fromUtfProc = UtfToUnicodeProc; type.freeProc = NULL; type.nullSize = 2; - type.clientData = NULL; + type.clientData = INT2PTR(isLe.c); Tcl_CreateEncoding(&type); /* @@ -755,11 +758,7 @@ Tcl_SetDefaultEncodingDir( * interp was NULL. * * Side effects: - * The new encoding type is entered into a table visible to all - * interpreters, keyed off the encoding's name. For each call to this - * function, there should eventually be a call to Tcl_FreeEncoding, so - * that the database can be cleaned up when encodings aren't needed - * anymore. + * LoadEncodingFile is called if necessary. 
* *------------------------------------------------------------------------- */ @@ -797,15 +796,15 @@ Tcl_GetEncoding( * * Tcl_FreeEncoding -- * - * This function is called to release an encoding allocated by - * Tcl_CreateEncoding() or Tcl_GetEncoding(). + * Releases an encoding allocated by Tcl_CreateEncoding() or + * Tcl_GetEncoding(). * * Results: * None. * * Side effects: * The reference count associated with the encoding is decremented and - * the encoding may be deleted if nothing is using it anymore. + * the encoding is deleted if nothing is using it anymore. * *--------------------------------------------------------------------------- */ @@ -824,13 +823,14 @@ Tcl_FreeEncoding( * * FreeEncoding -- * - * This function is called to release an encoding by functions that - * already have the encodingMutex. + * Decrements the reference count of an encoding. The caller must hold + * encodingMutex. * * Results: * None. * * Side effects: + * Releases the resource for an encoding if it is now unused. * The reference count associated with the encoding is decremented and * the encoding may be deleted if nothing is using it anymore. * @@ -850,16 +850,17 @@ FreeEncoding( if (encodingPtr->refCount<=0) { Tcl_Panic("FreeEncoding: refcount problem !!!"); } - encodingPtr->refCount--; - if (encodingPtr->refCount == 0) { + if (encodingPtr->refCount-- <= 1) { if (encodingPtr->freeProc != NULL) { (*encodingPtr->freeProc)(encodingPtr->clientData); } if (encodingPtr->hPtr != NULL) { Tcl_DeleteHashEntry(encodingPtr->hPtr); } - ckfree((char *) encodingPtr->name); - ckfree((char *) encodingPtr); + if (encodingPtr->name) { + ckfree((char *)encodingPtr->name); + } + ckfree((char *)encodingPtr); } } @@ -1020,23 +1021,22 @@ Tcl_SetSystemEncoding( * * Tcl_CreateEncoding -- * - * This function is called to define a new encoding and the functions - * that are used to convert between the specified encoding and Unicode.
+ * Defines a new encoding, along with the functions that are used to + * convert to and from Unicode. * * Results: * Returns a token that represents the encoding. If an encoding with the * same name already existed, the old encoding token remains valid and - * continues to behave as it used to, and will eventually be garbage - * collected when the last reference to it goes away. Any subsequent - * calls to Tcl_GetEncoding with the specified name will retrieve the - * most recent encoding token. + * continues to behave as it used to, and is eventually garbage collected + * when the last reference to it goes away. Any subsequent calls to + * Tcl_GetEncoding with the specified name retrieve the most recent + * encoding token. * * Side effects: - * The new encoding type is entered into a table visible to all - * interpreters, keyed off the encoding's name. For each call to this - * function, there should eventually be a call to Tcl_FreeEncoding, so - * that the database can be cleaned up when encodings aren't needed - * anymore. + * A new record having the name of the encoding is entered into a table of + * encodings visible to all interpreters. For each call to this function, + * there should eventually be a call to Tcl_FreeEncoding, which + * deletes the record in the table when an encoding is no longer needed. * *--------------------------------------------------------------------------- */ @@ -1258,10 +1258,9 @@ Tcl_ExternalToUtf( * * Tcl_UtfToExternalDString -- * - * Convert a source buffer from UTF-8 into the specified encoding. If any + * Convert a source buffer from UTF-8 to the specified encoding. If any * of the bytes in the source buffer are invalid or cannot be represented - * in the target encoding, a default fallback character will be - * substituted. + * in the target encoding, a default fallback character is substituted.
* * Results: * The converted bytes are stored in the DString, which is then NULL @@ -1570,13 +1569,13 @@ OpenEncodingFileChannel( * the data. * * Results: - * The return value is the newly loaded Encoding, or NULL if the file - * didn't exist of was in the incorrect format. If NULL was returned, an - * error message is left in interp's result object, unless interp was - * NULL. + * The return value is the newly loaded Tcl_Encoding or NULL if the file + * didn't exist or could not be processed. If NULL is returned and interp + * is not NULL, an error message is left in interp's result object. * * Side effects: - * File read from disk. + * A corresponding encoding file might be read from persistent storage, in + * which case LoadTableEncoding is called. * *--------------------------------------------------------------------------- */ @@ -1584,8 +1583,8 @@ OpenEncodingFileChannel( static Tcl_Encoding LoadEncodingFile( Tcl_Interp *interp, /* Interp for error reporting, if not NULL. */ - const char *name) /* The name of the encoding file on disk and - * also the name for new encoding. */ + const char *name) /* The name of both the encoding file + * and the new encoding. */ { Tcl_Channel chan = NULL; Tcl_Encoding encoding = NULL; @@ -1637,27 +1636,27 @@ LoadEncodingFile( * * LoadTableEncoding -- * - * Helper function for LoadEncodingTable(). Loads a table to that - * converts between Unicode and some other encoding and creates an - * encoding (using a TableEncoding structure) from that information. + * Helper function for LoadEncodingFile(). Creates a Tcl_EncodingType + * structure along with its corresponding TableEncodingData structure, and + * passes it to Tcl_CreateEncoding. * - * File contains binary data, but begins with a marker to indicate - * byte-ordering, so that same binary file can be read on either endian - * platforms.
+ * The file contains binary data but begins with a marker to indicate + * byte-ordering so a single binary file can be read on big or + * little-endian systems. * * Results: - * The return value is the new encoding, or NULL if the encoding could - * not be created (because the file contained invalid data). + * Returns the new Tcl_Encoding, or NULL if it could + * not be created because the file contained invalid data. * * Side effects: - * None. + * See Tcl_CreateEncoding(). * *------------------------------------------------------------------------- */ static Tcl_Encoding LoadTableEncoding( - const char *name, /* Name for new encoding. */ + const char *name, /* Name of the new encoding. */ int type, /* Type of encoding (ENCODING_?????). */ Tcl_Channel chan) /* File containing new encoding. */ { @@ -1769,10 +1768,10 @@ LoadTableEncoding( } /* - * Invert toUnicode array to produce the fromUnicode array. Performs a + * Invert the toUnicode array to produce the fromUnicode array. Performs a * single malloc to get the memory for the array and all the pages needed - * by the array. While reading in the toUnicode array, we remembered what - * pages that would be needed for the fromUnicode array. + * by the array. While reading in the toUnicode array remember what + * pages are needed for the fromUnicode array. */ if (symbol) { @@ -1814,8 +1813,8 @@ LoadTableEncoding( if (type == ENCODING_MULTIBYTE) { /* * If multibyte encodings don't have a backslash character, define - * one. Otherwise, on Windows, native file names won't work because - * the backslash in the file name will map to the unknown character + * one. Otherwise, on Windows, native file names don't work because + * the backslash in the file name maps to the unknown character * (question mark) when converting from UTF-8 to external encoding.
*/ @@ -1829,13 +1828,13 @@ LoadTableEncoding( unsigned short *page; /* - * Make a special symbol encoding that not only maps the symbol - * characters from their Unicode code points down into page 0, but - * also ensure that the characters on page 0 map to themselves. This - * is so that a symbol font can be used to display a simple string - * like "abcd" and have alpha, beta, chi, delta show up, rather than - * have "unknown" chars show up because strictly speaking the symbol - * font doesn't have glyphs for those low ascii chars. + * Make a special symbol encoding that maps each symbol character from + * its Unicode code point down into page 0, and also ensure that each + * character on page 0 maps to itself so that a symbol font can be + * used to display a simple string like "abcd" and have alpha, beta, + * chi, delta show up, rather than have "unknown" chars show up because + * strictly speaking the symbol font doesn't have glyphs for those low + * ASCII chars. */ page = dataPtr->fromUnicode[0]; @@ -1939,7 +1938,7 @@ LoadTableEncoding( static Tcl_Encoding LoadEscapeEncoding( - const char *name, /* Name for new encoding. */ + const char *name, /* Name of the new encoding. */ Tcl_Channel chan) /* File containing new encoding. */ { int i; @@ -2318,7 +2317,7 @@ UtfToUtfProc( * * UnicodeToUtfProc -- * - * Convert from Unicode to UTF-8. + * Convert from UTF-16 to UTF-8. * * Results: * Returns TCL_OK if conversion was successful. @@ -2331,7 +2330,7 @@ UtfToUtfProc( static int UnicodeToUtfProc( - ClientData clientData, /* Not used. */ + ClientData clientData, /* != NULL means LE, == NULL means BE */ const char *src, /* Source string in Unicode. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags.
*/ @@ -2359,13 +2358,19 @@ UnicodeToUtfProc( const char *srcStart, *srcEnd; char *dstEnd, *dstStart; int result, numChars; - Tcl_UniChar ch; + unsigned short ch; result = TCL_OK; - if ((srcLen % sizeof(Tcl_UniChar)) != 0) { + + /* check alignment with utf-16 (2 == sizeof(UTF-16)) */ + if ((srcLen % 2) != 0) { + result = TCL_CONVERT_MULTIBYTE; + srcLen--; + } + /* If last code point is a high surrogate, we cannot handle that yet */ + if ((srcLen >= 2) && ((src[srcLen - (clientData?1:2)] & 0xFC) == 0xD8)) { result = TCL_CONVERT_MULTIBYTE; - srcLen /= sizeof(Tcl_UniChar); - srcLen *= sizeof(Tcl_UniChar); + srcLen-= 2; } srcStart = src; @@ -2379,17 +2384,21 @@ UnicodeToUtfProc( result = TCL_CONVERT_NOSPACE; break; } + if (clientData) { + ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF); + } else { + ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF); + } /* - * Special case for 1-byte utf chars for speed. Make sure we - * work with Tcl_UniChar-size data. + * Special case for 1-byte utf chars for speed. Make sure we work with + * unsigned short-size data. */ - ch = *(Tcl_UniChar *)src; if (ch && ch < 0x80) { *dst++ = (ch & 0xFF); } else { dst += Tcl_UniCharToUtf(ch, dst); } - src += sizeof(Tcl_UniChar); + src += sizeof(unsigned short); } *srcReadPtr = src - srcStart; @@ -2403,7 +2412,7 @@ UnicodeToUtfProc( * * UtfToUnicodeProc -- * - * Convert from UTF-8 to Unicode. + * Convert from UTF-8 to UTF-16. * * Results: * Returns TCL_OK if conversion was successful. @@ -2416,8 +2425,7 @@ UnicodeToUtfProc( static int UtfToUnicodeProc( - ClientData clientData, /* TableEncodingData that specifies - * encoding. */ + ClientData clientData, /* != NULL means LE, == NUL means BE */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. 
*/ @@ -2444,7 +2452,7 @@ UtfToUnicodeProc( { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; - Tcl_UniChar ch; + Tcl_UniChar ch = 0; srcStart = src; srcEnd = src + srcLen; @@ -2476,27 +2484,37 @@ UtfToUnicodeProc( * Need to handle this in a way that won't cause misalignment * by casting dst to a Tcl_UniChar. [Bug 1122671] */ -#ifdef WORDS_BIGENDIAN + if (clientData) { #if TCL_UTF_MAX > 4 - *dst++ = (ch >> 24); - *dst++ = ((ch >> 16) & 0xFF); - *dst++ = ((ch >> 8) & 0xFF); - *dst++ = (ch & 0xFF); + if (ch <= 0xFFFF) { + *dst++ = (ch & 0xFF); + *dst++ = (ch >> 8); + } else { + *dst++ = (((ch - 0x10000) >> 10) & 0xFF); + *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; + *dst++ = (ch & 0xFF); + *dst++ = ((ch & 0x3) >> 8) | 0xDC; + } #else - *dst++ = (ch >> 8); - *dst++ = (ch & 0xFF); + *dst++ = (ch & 0xFF); + *dst++ = (ch >> 8); #endif -#else + } else { #if TCL_UTF_MAX > 4 - *dst++ = (ch & 0xFF); - *dst++ = ((ch >> 8) & 0xFF); - *dst++ = ((ch >> 16) & 0xFF); - *dst++ = (ch >> 24); + if (ch <= 0xFFFF) { + *dst++ = (ch >> 8); + *dst++ = (ch & 0xFF); + } else { + *dst++ = ((ch & 0x3) >> 8) | 0xDC; + *dst++ = (ch & 0xFF); + *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; + *dst++ = (((ch - 0x10000) >> 10) & 0xFF); + } #else - *dst++ = (ch & 0xFF); - *dst++ = (ch >> 8); -#endif + *dst++ = (ch >> 8); + *dst++ = (ch & 0xFF); #endif + } } *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; @@ -2899,7 +2917,6 @@ Iso88591FromUtfProc( result = TCL_CONVERT_UNKNOWN; break; } - /* * Plunge on, using '?' as a fallback character. */ @@ -3387,14 +3404,13 @@ EscapeFromUtfProc( * * EscapeFreeProc -- * - * This function is invoked when an EscapeEncodingData encoding is - * deleted. It deletes the memory used by the encoding. + * Frees resources used by the encoding. * * Results: * None. * * Side effects: - * Memory freed. + * Memory is freed. 
* *--------------------------------------------------------------------------- */ -- cgit v0.12 From 2958d5196de3452ea46a083603d4ce1dc0d05d2a Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Mon, 20 Apr 2020 07:50:22 +0000 Subject: Move the needed apt package in .travis.yml to the top, so they can be shared between the images. --- .travis.yml | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/.travis.yml b/.travis.yml index e10ca7c..5672c0b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,15 @@ sudo: false language: c +addons: + apt: + packages: + - binutils-mingw-w64-i686 + - binutils-mingw-w64-x86-64 + - gcc-mingw-w64 + - gcc-mingw-w64-base + - gcc-mingw-w64-i686 + - gcc-mingw-w64-x86-64 + - gcc-multilib matrix: include: # Testing on Linux with various compilers @@ -146,13 +156,6 @@ matrix: os: linux dist: bionic compiler: x86_64-w64-mingw32-gcc - addons: - apt: - packages: - - gcc-mingw-w64-base - - binutils-mingw-w64-x86-64 - - gcc-mingw-w64-x86-64 - - gcc-mingw-w64 env: - BUILD_DIR=win - CFGOPT="--host=x86_64-w64-mingw32 --enable-64bit --enable-threads" @@ -167,14 +170,6 @@ matrix: os: linux dist: bionic compiler: i686-w64-mingw32-gcc - addons: - apt: - packages: - - gcc-mingw-w64-base - - binutils-mingw-w64-i686 - - gcc-mingw-w64-i686 - - gcc-mingw-w64 - - gcc-multilib env: - BUILD_DIR=win - CFGOPT="--host=i686-w64-mingw32 --enable-threads" -- cgit v0.12 From 58cf4db1ccb0602f9bd023ecc4e56830aea2453a Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 20 Apr 2020 22:35:39 +0000 Subject: Tie together the TCL_UTF_MAX=4 and TCL_UTF_MAX=6 builds to mean the same thing on the 8.5 branch -- use internal UCS-4 storage. 
--- generic/regcustom.h | 2 +- generic/tcl.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/generic/regcustom.h b/generic/regcustom.h index 57a2d47..ac33087 100644 --- a/generic/regcustom.h +++ b/generic/regcustom.h @@ -97,7 +97,7 @@ typedef int celt; /* Type to hold chr, or NOCELT */ #define NOCELT (-1) /* Celt value which is not valid chr */ #define CHR(c) (UCHAR(c)) /* Turn char literal into chr literal */ #define DIGITVAL(c) ((c)-'0') /* Turn chr digit into its value */ -#if TCL_UTF_MAX > 4 +#if TCL_UTF_MAX > 3 #define CHRBITS 32 /* Bits in a chr; must not use sizeof */ #define CHR_MIN 0x00000000 /* Smallest and largest chr; the value */ #define CHR_MAX 0xffffffff /* CHR_MAX-CHR_MIN+1 should fit in uchr */ diff --git a/generic/tcl.h b/generic/tcl.h index 7378a8f..d7d064c 100644 --- a/generic/tcl.h +++ b/generic/tcl.h @@ -2148,7 +2148,7 @@ typedef struct Tcl_Parse { * reflected in regcustom.h. */ -#if TCL_UTF_MAX > 4 +#if TCL_UTF_MAX > 3 /* * unsigned int isn't 100% accurate as it should be a strict 4-byte value * (perhaps wchar_t). 64-bit systems may have troubles. The size of this -- cgit v0.12 From 4c9bc32c393e100ae9caf7f06e57c798f96ada6d Mon Sep 17 00:00:00 2001 From: dgp Date: Mon, 20 Apr 2020 23:00:30 +0000 Subject: Pair every compat85 test with a fullutf test so that we cover all variants. 
--- tests/utf.test | 137 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 107 insertions(+), 30 deletions(-) diff --git a/tests/utf.test b/tests/utf.test index 1ca3647..1c79f32 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -13,7 +13,9 @@ if {[lsearch [namespace children] ::tcltest] == -1} { namespace import -force ::tcltest::* } -testConstraint compat85 [expr {[format %c 0x010000] == "\uFFFD"}] +testConstraint compat85 [expr {[format %c 0x010000] eq "\uFFFD"}] +testConstraint fullutf [expr {[format %c 0x010000] ne "\uFFFD"}] + testConstraint testbytestring [llength [info commands testbytestring]] testConstraint testfindfirst [llength [info commands testfindfirst]] testConstraint testfindlast [llength [info commands testfindlast]] @@ -64,12 +66,18 @@ test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} testbytestrin test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} testbytestring { string length [testbytestring "\xE4\xB9\x8E"] } {1} -test utf-2.8 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring compat85} -body { +test utf-2.8.0 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring compat85} -body { string length [testbytestring "\xF0\x90\x80\x80"] } -result {4} -test utf-2.9 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring compat85} -body { +test utf-2.8.1 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring fullutf} -body { + string length [testbytestring "\xF0\x90\x80\x80"] +} -result {1} +test utf-2.9.0 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring compat85} -body { string length [testbytestring "\xF4\x8F\xBF\xBF"] } -result {4} +test utf-2.9.1 {Tcl_UtfToUniChar: lead (4-byte) followed by 3 trail} -constraints {testbytestring fullutf} -body { + string length [testbytestring "\xF4\x8F\xBF\xBF"] +} -result {1} test utf-2.10 {Tcl_UtfToUniChar: 
lead (4-byte) followed by 3 trail, underflow} testbytestring { string length [testbytestring "\xF0\x8F\xBF\xBF"] } {4} @@ -118,9 +126,12 @@ test utf-4.10 {Tcl_NumUtfChars: #u0000, calc len, overcomplete} {testnumutfchars test utf-4.11 {Tcl_NumUtfChars: 3 bytes of 4-byte UTF-8 characater} {testnumutfchars testbytestring} { testnumutfchars [testbytestring \xF0\x9F\x92\xA9] 3 } {3} -test utf-4.12 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring compat85} { +test utf-4.12.0 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring compat85} { testnumutfchars [testbytestring \xF0\x9F\x92\xA9] 4 } {4} +test utf-4.12.1 {Tcl_NumUtfChars: #4-byte UTF-8 character} {testnumutfchars testbytestring fullutf} { + testnumutfchars [testbytestring \xF0\x9F\x92\xA9] 4 +} {1} test utf-5.1 {Tcl_UtfFindFirst} {testfindfirst testbytestring} { testfindfirst [testbytestring "abcbc"] 98 @@ -335,9 +346,12 @@ test utf-6.67 {Tcl_UtfNext} testutfnext { test utf-6.68 {Tcl_UtfNext} testutfnext { testutfnext \xF2\xA0\xA0G } 1 -test utf-6.69 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.69.0 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0 } 1 +test utf-6.69.1 {Tcl_UtfNext} {testutfnext fullutf} { + testutfnext \xF2\xA0\xA0\xA0 +} 4 test utf-6.70 {Tcl_UtfNext} testutfnext { testutfnext \xF2\xA0\xA0\xD0 } 1 @@ -350,24 +364,42 @@ test utf-6.71 {Tcl_UtfNext} testutfnext { test utf-6.73 {Tcl_UtfNext} testutfnext { testutfnext \xF2\xA0\xA0\xF8 } 1 -test utf-6.74 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.74.0 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0G } 1 -test utf-6.75 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.74.1 {Tcl_UtfNext} {testutfnext fullutf} { + testutfnext \xF2\xA0\xA0\xA0G +} 4 +test utf-6.75.0 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xA0 } 1 -test utf-6.76 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.75.1 {Tcl_UtfNext} {testutfnext fullutf} { + 
testutfnext \xF2\xA0\xA0\xA0\xA0 +} 4 +test utf-6.76.0 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xD0 } 1 -test utf-6.77 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.76.1 {Tcl_UtfNext} {testutfnext fullutf} { + testutfnext \xF2\xA0\xA0\xA0\xD0 +} 4 +test utf-6.77.0 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xE8 } 1 -test utf-6.78 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.77.1 {Tcl_UtfNext} {testutfnext fullutf} { + testutfnext \xF2\xA0\xA0\xA0\xE8 +} 4 +test utf-6.78.0 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0\xF2 } 1 -test utf-6.79 {Tcl_UtfNext} {testutfnext compat85} { +test utf-6.78.1 {Tcl_UtfNext} {testutfnext fullutf} { + testutfnext \xF2\xA0\xA0\xA0\xF2 +} 4 +test utf-6.79.0 {Tcl_UtfNext} {testutfnext compat85} { testutfnext \xF2\xA0\xA0\xA0G\xF8 } 1 +test utf-6.79.1 {Tcl_UtfNext} {testutfnext fullutf} { + testutfnext \xF2\xA0\xA0\xA0G\xF8 +} 4 test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xC0\x80 } 2 @@ -389,9 +421,12 @@ test utf-6.85 {Tcl_UtfNext - overlong sequences} testutfnext { test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext { testutfnext \xF0\x80\x80\x80 } 1 -test utf-6.87 {Tcl_UtfNext - overlong sequences} {testutfnext compat85} { +test utf-6.87.0 {Tcl_UtfNext - overlong sequences} {testutfnext compat85} { testutfnext \xF0\x90\x80\x80 } 1 +test utf-6.87.1 {Tcl_UtfNext - overlong sequences} {testutfnext fullutf} { + testutfnext \xF0\x90\x80\x80 +} 4 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { testutfnext \xA0\xA0 } 1 @@ -404,9 +439,12 @@ test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {te test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { testutfnext \xF0\x80\x80 1 } 2 -test utf-6.90 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext compat85} { +test utf-6.90.0 {Tcl_UtfNext, validity check 
[493dccc2de]} {testutfnext compat85} { testutfnext \xF4\x8F\xBF\xBF } 1 +test utf-6.90.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext fullutf} { + testutfnext \xF4\x8F\xBF\xBF +} 4 test utf-6.91 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext { testutfnext \xF4\x90\x80\x80 } 1 @@ -474,15 +512,24 @@ test utf-7.9.1 {Tcl_UtfPrev} testutfprev { test utf-7.9.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xF8\xA0 3 } 2 -test utf-7.10 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.10.0 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0 } 2 -test utf-7.10.1 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.10.1 {Tcl_UtfPrev} {testutfprev fullutf} { + testutfprev A\xF2\xA0 +} 1 +test utf-7.10.1.0 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xA0 3 } 2 -test utf-7.10.2 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.10.1.1 {Tcl_UtfPrev} {testutfprev fullutf} { + testutfprev A\xF2\xA0\xA0\xA0 3 +} 1 +test utf-7.10.2.0 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xF8\xA0 3 } 2 +test utf-7.10.2.1 {Tcl_UtfPrev} {testutfprev fullutf} { + testutfprev A\xF2\xA0\xF8\xA0 3 +} 1 test utf-7.11 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0 } 1 @@ -522,15 +569,24 @@ test utf-7.14.1 {Tcl_UtfPrev} testutfprev { test utf-7.14.2 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xF8 4 } 3 -test utf-7.15 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.15.0 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0 } 3 -test utf-7.15.1 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.15.1 {Tcl_UtfPrev} {testutfprev fullutf} { + testutfprev A\xF2\xA0\xA0 +} 1 +test utf-7.15.1.0 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xA0 4 } 3 -test utf-7.15.2 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.15.1.1 {Tcl_UtfPrev} {testutfprev fullutf} { + testutfprev A\xF2\xA0\xA0\xA0 4 +} 1 +test utf-7.15.2.0 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xF8 4 } 3 +test 
utf-7.15.2.1 {Tcl_UtfPrev} {testutfprev fullutf} { + testutfprev A\xF2\xA0\xA0\xF8 4 +} 1 test utf-7.16 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xA0 } 1 @@ -561,9 +617,12 @@ test utf-7.18.2 {Tcl_UtfPrev} testutfprev { test utf-7.19 {Tcl_UtfPrev} testutfprev { testutfprev A\xF8\xA0\xA0\xA0 } 4 -test utf-7.20 {Tcl_UtfPrev} {testutfprev compat85} { +test utf-7.20.0 {Tcl_UtfPrev} {testutfprev compat85} { testutfprev A\xF2\xA0\xA0\xA0 } 4 +test utf-7.20.1 {Tcl_UtfPrev} {testutfprev fullutf} { + testutfprev A\xF2\xA0\xA0\xA0 +} 1 test utf-7.21 {Tcl_UtfPrev} testutfprev { testutfprev A\xE8\xA0\xA0\xA0 } 4 @@ -624,15 +683,24 @@ test utf-7.37 {Tcl_UtfPrev -- overlong sequence} testutfprev { test utf-7.38 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xE0\xA0\x80 2 } 1 -test utf-7.39 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { +test utf-7.39.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xF0\x90\x80\x80 } 4 -test utf-7.40 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { +test utf-7.39.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev fullutf} { + testutfprev A\xF0\x90\x80\x80 +} 1 +test utf-7.40.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xF0\x90\x80\x80 4 } 3 -test utf-7.41 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { +test utf-7.40.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev fullutf} { + testutfprev A\xF0\x90\x80\x80 4 +} 1 +test utf-7.41.0 {Tcl_UtfPrev -- overlong sequence} {testutfprev compat85} { testutfprev A\xF0\x90\x80\x80 3 } 2 +test utf-7.41.1 {Tcl_UtfPrev -- overlong sequence} {testutfprev fullutf} { + testutfprev A\xF0\x90\x80\x80 3 +} 1 test utf-7.42 {Tcl_UtfPrev -- overlong sequence} testutfprev { testutfprev A\xF0\x90\x80\x80 2 } 1 @@ -657,15 +725,24 @@ test utf-7.47.1 {Tcl_UtfPrev, pointing to 3th byte of 3-byte valid sequence} {te test utf-7.47.2 {Tcl_UtfPrev, pointing to 3th byte of 3-byte invalid sequence} {testutfprev} { 
testutfprev \xE8\xA0\x00 2 } 0 -test utf-7.48 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { +test utf-7.48.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { testutfprev A\xF4\x8F\xBF\xBF } 4 -test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { +test utf-7.48.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} { + testutfprev A\xF4\x8F\xBF\xBF +} 1 +test utf-7.48.1.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { testutfprev A\xF4\x8F\xBF\xBF 4 } 3 -test utf-7.48.2 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { +test utf-7.48.1.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} { + testutfprev A\xF4\x8F\xBF\xBF 4 +} 1 +test utf-7.48.2.0 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev compat85} { testutfprev A\xF4\x8F\xBF\xBF 3 } 2 +test utf-7.48.2.1 {Tcl_UtfPrev, validity check [493dccc2de]} {testutfprev fullutf} { + testutfprev A\xF4\x8F\xBF\xBF 3 +} 1 test utf-7.48.3 {Tcl_UtfPrev, validity check [493dccc2de]} testutfprev { testutfprev A\xF4\x8F\xBF\xBF 2 } 1 @@ -700,10 +777,10 @@ test utf-8.5 {Tcl_UniCharAtIndex: high surrogate} { test utf-8.6 {Tcl_UniCharAtIndex: low surrogate} { string index \uDC42 0 } "\uDC42" -test utf-8.7 {Tcl_UniCharAtIndex: Emoji} compat85 { +test utf-8.7 {Tcl_UniCharAtIndex: Emoji} { string index \uD83D\uDE00 0 } "\uD83D" -test utf-8.8 {Tcl_UniCharAtIndex: Emoji} compat85 { +test utf-8.8 {Tcl_UniCharAtIndex: Emoji} { string index \uD83D\uDE00 1 } "\uDE00" @@ -713,10 +790,10 @@ test utf-9.1 {Tcl_UtfAtIndex: index = 0} { test utf-9.2 {Tcl_UtfAtIndex: index > 0} { string range \u4E4E\u25A\xFF\u543klmnop 1 5 } "\u25A\xFF\u543kl" -test utf-9.3 {Tcl_UtfAtIndex: index = 0, Emoji} compat85 { +test utf-9.3 {Tcl_UtfAtIndex: index = 0, Emoji} { string range \uD83D\uDE00G 0 0 } "\uD83D" -test utf-9.4 {Tcl_UtfAtIndex: index > 0, Emoji} compat85 { +test utf-9.4 {Tcl_UtfAtIndex: index > 0, Emoji} { 
string range \uD83D\uDE00G 1 1 } "\uDE00" -- cgit v0.12 From d8bc590eef94f9e9ec24150cf8208f38638290ef Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 21 Apr 2020 02:50:23 +0000 Subject: Revert the backport to tclEncoding.c that seems to redefine the "unicode" encoding to mean UTF-16. Don't want that behavior change in 8.5. --- generic/tclEncoding.c | 240 +++++++++++++++++++++++--------------------------- 1 file changed, 112 insertions(+), 128 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index da03055..5a9d2d5 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -83,7 +83,7 @@ typedef struct TableEncodingData { } TableEncodingData; /* - * Each of the following structures is the clientData for a dynamically-loaded + * The following structures is the clientData for a dynamically-loaded, * escape-driven encoding that is itself comprised of other simpler encodings. * An example is "iso-2022-jp", which uses escape sequences to switch between * ascii, jis0208, jis0212, gb2312, and ksc5601. Note that "escape-driven" @@ -117,8 +117,8 @@ typedef struct EscapeEncodingData { * 0. */ int numSubTables; /* Length of following array. */ EscapeSubTable subTables[1];/* Information about each EscapeSubTable used - * by this encoding type. The actual size is - * as large as necessary to hold all + * by this encoding type. The actual size will + * be as large as necessary to hold all * EscapeSubTables. */ } EscapeEncodingData; @@ -156,7 +156,7 @@ static ProcessGlobalValue encodingFileMap = { * A list of directories making up the "library path". Historically this * search path has served many uses, but the only one remaining is a base for * the encodingSearchPath above. If the application does not explicitly set - * the encodingSearchPath, then it is initialized by appending /encoding + * the encodingSearchPath, then it will be initialized by appending /encoding * to each directory in this "libraryPath". 
*/ @@ -177,7 +177,7 @@ TCL_DECLARE_MUTEX(encodingMutex) /* * The following are used to hold the default and current system encodings. * If NULL is passed to one of the conversion routines, the current setting of - * the system encoding is used to perform the conversion. + * the system encoding will be used to perform the conversion. */ static Tcl_Encoding defaultEncoding; @@ -429,8 +429,9 @@ TclGetLibraryPath(void) * Keeps the per-thread copy of the library path current with changes to * the global copy. * - * Since the result of this routine is void, if searchPath is not a valid - * list this routine silently does nothing. + * NOTE: this routine returns void, so there's no way to report the error + * that searchPath is not a valid list. In that case, this routine will + * silently do nothing. * *---------------------------------------------------------------------- */ @@ -452,16 +453,17 @@ TclSetLibraryPath( * * FillEncodingFileMap -- * - * Called to update the encoding file map with the current value - * of the encoding search path. + * Called to bring the encoding file map in sync with the current value + * of the encoding search path. * - * Finds *.end files in the directories on the encoding search path and - * stores the found pathnames in a map associated with the encoding name. + * Scan the directories on the encoding search path, find the *.enc + * files, and store the found pathnames in a map associated with the + * encoding name. * - * If $dir is on the encoding search path and the file $dir/foo.enc is - * found, stores a "foo" -> $dir entry in the map. if the "foo" encoding - * is needed later, the $dir/foo.enc name can be quickly constructed in - * order to read the encoding data. + * In particular, if $dir is on the encoding search path, and the file + * $dir/foo.enc is found, then store a "foo" -> $dir entry in the map. 
+ * Later, any need for the "foo" encoding will quickly * be able to + * construct the $dir/foo.enc pathname for reading the encoding data. * * Results: * None. @@ -542,24 +544,19 @@ void TclInitEncodingSubsystem(void) { Tcl_EncodingType type; - union { - char c; - short s; - } isLe; if (encodingsInitialized) { return; } - isLe.s = 1; Tcl_MutexLock(&encodingMutex); Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS); Tcl_MutexUnlock(&encodingMutex); /* - * Create a few initial encodings. UTF-8 to UTF-8 translation is not a - * no-op because it turns a stream of improperly formed UTF-8 into a - * properly formed stream. + * Create a few initial encodings. Note that the UTF-8 to UTF-8 + * translation is not a no-op, because it will turn a stream of improperly + * formed UTF-8 into a properly formed stream. */ type.encodingName = "identity"; @@ -586,7 +583,7 @@ TclInitEncodingSubsystem(void) type.fromUtfProc = UtfToUnicodeProc; type.freeProc = NULL; type.nullSize = 2; - type.clientData = INT2PTR(isLe.c); + type.clientData = NULL; Tcl_CreateEncoding(&type); /* @@ -758,7 +755,11 @@ Tcl_SetDefaultEncodingDir( * interp was NULL. * * Side effects: - * LoadEncodingFile is called if necessary. + * The new encoding type is entered into a table visible to all + * interpreters, keyed off the encoding's name. For each call to this + * function, there should eventually be a call to Tcl_FreeEncoding, so + * that the database can be cleaned up when encodings aren't needed + * anymore. * *------------------------------------------------------------------------- */ @@ -796,15 +797,15 @@ Tcl_GetEncoding( * * Tcl_FreeEncoding -- * - * Releases an encoding allocated by Tcl_CreateEncoding() or - * Tcl_GetEncoding(). + * This function is called to release an encoding allocated by + * Tcl_CreateEncoding() or Tcl_GetEncoding(). * * Results: * None. 
* * Side effects: * The reference count associated with the encoding is decremented and - * the encoding is deleted if nothing is using it anymore. + * the encoding may be deleted if nothing is using it anymore. * *--------------------------------------------------------------------------- */ @@ -823,14 +824,13 @@ Tcl_FreeEncoding( * * FreeEncoding -- * - * Decrements the reference count of an encoding. The caller must hold - * encodingMutes. + * This function is called to release an encoding by functions that + * already have the encodingMutex. * * Results: * None. * * Side effects: - * Releases the resource for an encoding if it is now unused. * The reference count associated with the encoding is decremented and * the encoding may be deleted if nothing is using it anymore. * @@ -850,17 +850,16 @@ FreeEncoding( if (encodingPtr->refCount<=0) { Tcl_Panic("FreeEncoding: refcount problem !!!"); } - if (encodingPtr->refCount-- <= 1) { + encodingPtr->refCount--; + if (encodingPtr->refCount == 0) { if (encodingPtr->freeProc != NULL) { (*encodingPtr->freeProc)(encodingPtr->clientData); } if (encodingPtr->hPtr != NULL) { Tcl_DeleteHashEntry(encodingPtr->hPtr); } - if (encodingPtr->name) { - ckfree((char *)encodingPtr->name); - } - ckfree((char *)encodingPtr); + ckfree((char *) encodingPtr->name); + ckfree((char *) encodingPtr); } } @@ -1021,22 +1020,23 @@ Tcl_SetSystemEncoding( * * Tcl_CreateEncoding -- * - * Defines a new encoding, along with the functions that are used to - * convert to and from Unicode. + * This function is called to define a new encoding and the functions + * that are used to convert between the specified encoding and Unicode. * * Results: * Returns a token that represents the encoding. If an encoding with the * same name already existed, the old encoding token remains valid and - * continues to behave as it used to, and is eventually garbage collected - * when the last reference to it goes away. 
Any subsequent calls to - * Tcl_GetEncoding with the specified name retrieve the most recent - * encoding token. + * continues to behave as it used to, and will eventually be garbage + * collected when the last reference to it goes away. Any subsequent + * calls to Tcl_GetEncoding with the specified name will retrieve the + * most recent encoding token. * * Side effects: - * A new record having the name of the encoding is entered into a table of - * encodings visible to all interpreters. For each call to this function, - * there should eventually be a call to Tcl_FreeEncoding, which cleans - * deletes the record in the table when an encoding is no longer needed. + * The new encoding type is entered into a table visible to all + * interpreters, keyed off the encoding's name. For each call to this + * function, there should eventually be a call to Tcl_FreeEncoding, so + * that the database can be cleaned up when encodings aren't needed + * anymore. * *--------------------------------------------------------------------------- */ @@ -1258,9 +1258,10 @@ Tcl_ExternalToUtf( * * Tcl_UtfToExternalDString -- * - * Convert a source buffer from UTF-8 to the specified encoding. If any + * Convert a source buffer from UTF-8 into the specified encoding. If any * of the bytes in the source buffer are invalid or cannot be represented - * in the target encoding, a default fallback character is substituted. + * in the target encoding, a default fallback character will be + * substituted. * * Results: * The converted bytes are stored in the DString, which is then NULL @@ -1569,13 +1570,13 @@ OpenEncodingFileChannel( * the data. * * Results: - * The return value is the newly loaded Tcl_Encoding or NULL if the file - * didn't exist or could not be processed. If NULL is returned and interp - * is not NULL, an error message is left in interp's result object. + * The return value is the newly loaded Encoding, or NULL if the file + * didn't exist of was in the incorrect format. 
If NULL was returned, an + * error message is left in interp's result object, unless interp was + * NULL. * * Side effects: - * A corresponding encoding file might be read from persistent storage, in - * which case LoadTableEncoding is called. + * File read from disk. * *--------------------------------------------------------------------------- */ @@ -1583,8 +1584,8 @@ OpenEncodingFileChannel( static Tcl_Encoding LoadEncodingFile( Tcl_Interp *interp, /* Interp for error reporting, if not NULL. */ - const char *name) /* The name of both the encoding file - * and the new encoding. */ + const char *name) /* The name of the encoding file on disk and + * also the name for new encoding. */ { Tcl_Channel chan = NULL; Tcl_Encoding encoding = NULL; @@ -1636,27 +1637,27 @@ LoadEncodingFile( * * LoadTableEncoding -- * - * Helper function for LoadEncodingFile(). Creates a Tcl_EncodingType - * structure along with its corresponding TableEncodingData structure, and - * passes it to Tcl_Createncoding. + * Helper function for LoadEncodingTable(). Loads a table to that + * converts between Unicode and some other encoding and creates an + * encoding (using a TableEncoding structure) from that information. * - * The file contains binary data but begins with a marker to indicate - * byte-ordering so a single binary file can be read on big or - * little-endian systems. + * File contains binary data, but begins with a marker to indicate + * byte-ordering, so that same binary file can be read on either endian + * platforms. * * Results: - * Returns the new Tcl_Encoding, or NULL if it could could - * not be created because the file contained invalid data. + * The return value is the new encoding, or NULL if the encoding could + * not be created (because the file contained invalid data). * * Side effects: - * See Tcl_CreateEncoding(). + * None. 
* *------------------------------------------------------------------------- */ static Tcl_Encoding LoadTableEncoding( - const char *name, /* Name of the new encoding. */ + const char *name, /* Name for new encoding. */ int type, /* Type of encoding (ENCODING_?????). */ Tcl_Channel chan) /* File containing new encoding. */ { @@ -1768,10 +1769,10 @@ LoadTableEncoding( } /* - * Invert the toUnicode array to produce the fromUnicode array. Performs a + * Invert toUnicode array to produce the fromUnicode array. Performs a * single malloc to get the memory for the array and all the pages needed - * by the array. While reading in the toUnicode array remember what - * pages are needed for the fromUnicode array. + * by the array. While reading in the toUnicode array, we remembered what + * pages that would be needed for the fromUnicode array. */ if (symbol) { @@ -1813,8 +1814,8 @@ LoadTableEncoding( if (type == ENCODING_MULTIBYTE) { /* * If multibyte encodings don't have a backslash character, define - * one. Otherwise, on Windows, native file names don't work because - * the backslash in the file name maps to the unknown character + * one. Otherwise, on Windows, native file names won't work because + * the backslash in the file name will map to the unknown character * (question mark) when converting from UTF-8 to external encoding. */ @@ -1828,13 +1829,13 @@ LoadTableEncoding( unsigned short *page; /* - * Make a special symbol encoding that maps each symbol character from - * its Unicode code point down into page 0, and also ensure that each - * characters on page 0 maps to itself so that a symbol font can be - * used to display a simple string like "abcd" and have alpha, beta, - * chi, delta show up, rather than have "unknown" chars show up because - * strictly speaking the symbol font doesn't have glyphs for those low - * ASCII chars. 
+ * Make a special symbol encoding that not only maps the symbol + * characters from their Unicode code points down into page 0, but + * also ensure that the characters on page 0 map to themselves. This + * is so that a symbol font can be used to display a simple string + * like "abcd" and have alpha, beta, chi, delta show up, rather than + * have "unknown" chars show up because strictly speaking the symbol + * font doesn't have glyphs for those low ascii chars. */ page = dataPtr->fromUnicode[0]; @@ -1938,7 +1939,7 @@ LoadTableEncoding( static Tcl_Encoding LoadEscapeEncoding( - const char *name, /* Name of the new encoding. */ + const char *name, /* Name for new encoding. */ Tcl_Channel chan) /* File containing new encoding. */ { int i; @@ -2317,7 +2318,7 @@ UtfToUtfProc( * * UnicodeToUtfProc -- * - * Convert from UTF-16 to UTF-8. + * Convert from Unicode to UTF-8. * * Results: * Returns TCL_OK if conversion was successful. @@ -2330,7 +2331,7 @@ UtfToUtfProc( static int UnicodeToUtfProc( - ClientData clientData, /* != NULL means LE, == NUL means BE */ + ClientData clientData, /* Not used. */ const char *src, /* Source string in Unicode. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. 
*/ @@ -2358,19 +2359,13 @@ UnicodeToUtfProc( const char *srcStart, *srcEnd; char *dstEnd, *dstStart; int result, numChars; - unsigned short ch; + Tcl_UniChar ch; result = TCL_OK; - - /* check alignment with utf-16 (2 == sizeof(UTF-16)) */ - if ((srcLen % 2) != 0) { - result = TCL_CONVERT_MULTIBYTE; - srcLen--; - } - /* If last code point is a high surrogate, we cannot handle that yet */ - if ((srcLen >= 2) && ((src[srcLen - (clientData?1:2)] & 0xFC) == 0xD8)) { + if ((srcLen % sizeof(Tcl_UniChar)) != 0) { result = TCL_CONVERT_MULTIBYTE; - srcLen-= 2; + srcLen /= sizeof(Tcl_UniChar); + srcLen *= sizeof(Tcl_UniChar); } srcStart = src; @@ -2384,21 +2379,17 @@ UnicodeToUtfProc( result = TCL_CONVERT_NOSPACE; break; } - if (clientData) { - ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF); - } else { - ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF); - } /* - * Special case for 1-byte utf chars for speed. Make sure we work with - * unsigned short-size data. + * Special case for 1-byte utf chars for speed. Make sure we + * work with Tcl_UniChar-size data. */ + ch = *(Tcl_UniChar *)src; if (ch && ch < 0x80) { *dst++ = (ch & 0xFF); } else { dst += Tcl_UniCharToUtf(ch, dst); } - src += sizeof(unsigned short); + src += sizeof(Tcl_UniChar); } *srcReadPtr = src - srcStart; @@ -2412,7 +2403,7 @@ UnicodeToUtfProc( * * UtfToUnicodeProc -- * - * Convert from UTF-8 to UTF-16. + * Convert from UTF-8 to Unicode. * * Results: * Returns TCL_OK if conversion was successful. @@ -2425,7 +2416,8 @@ UnicodeToUtfProc( static int UtfToUnicodeProc( - ClientData clientData, /* != NULL means LE, == NUL means BE */ + ClientData clientData, /* TableEncodingData that specifies + * encoding. */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. 
*/ @@ -2452,7 +2444,7 @@ UtfToUnicodeProc( { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; - Tcl_UniChar ch = 0; + Tcl_UniChar ch; srcStart = src; srcEnd = src + srcLen; @@ -2484,37 +2476,27 @@ UtfToUnicodeProc( * Need to handle this in a way that won't cause misalignment * by casting dst to a Tcl_UniChar. [Bug 1122671] */ - if (clientData) { +#ifdef WORDS_BIGENDIAN #if TCL_UTF_MAX > 4 - if (ch <= 0xFFFF) { - *dst++ = (ch & 0xFF); - *dst++ = (ch >> 8); - } else { - *dst++ = (((ch - 0x10000) >> 10) & 0xFF); - *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (ch & 0xFF); - *dst++ = ((ch & 0x3) >> 8) | 0xDC; - } + *dst++ = (ch >> 24); + *dst++ = ((ch >> 16) & 0xFF); + *dst++ = ((ch >> 8) & 0xFF); + *dst++ = (ch & 0xFF); #else - *dst++ = (ch & 0xFF); - *dst++ = (ch >> 8); + *dst++ = (ch >> 8); + *dst++ = (ch & 0xFF); #endif - } else { +#else #if TCL_UTF_MAX > 4 - if (ch <= 0xFFFF) { - *dst++ = (ch >> 8); - *dst++ = (ch & 0xFF); - } else { - *dst++ = ((ch & 0x3) >> 8) | 0xDC; - *dst++ = (ch & 0xFF); - *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (((ch - 0x10000) >> 10) & 0xFF); - } + *dst++ = (ch & 0xFF); + *dst++ = ((ch >> 8) & 0xFF); + *dst++ = ((ch >> 16) & 0xFF); + *dst++ = (ch >> 24); #else - *dst++ = (ch >> 8); - *dst++ = (ch & 0xFF); + *dst++ = (ch & 0xFF); + *dst++ = (ch >> 8); +#endif #endif - } } *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; @@ -2917,6 +2899,7 @@ Iso88591FromUtfProc( result = TCL_CONVERT_UNKNOWN; break; } + /* * Plunge on, using '?' as a fallback character. */ @@ -3404,13 +3387,14 @@ EscapeFromUtfProc( * * EscapeFreeProc -- * - * Frees resources used by the encoding. + * This function is invoked when an EscapeEncodingData encoding is + * deleted. It deletes the memory used by the encoding. * * Results: * None. * * Side effects: - * Memory is freed. + * Memory freed. 
* *--------------------------------------------------------------------------- */ -- cgit v0.12 From 5ccd380c46e3e74f3273ecfa83b0686bca5e8056 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 21 Apr 2020 02:57:41 +0000 Subject: We've settled on using (TCL_UTF_MAX > 3) to indicate 4-byte Tcl_UniChar. --- generic/tclEncoding.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 5a9d2d5..66bec44 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2477,7 +2477,7 @@ UtfToUnicodeProc( * by casting dst to a Tcl_UniChar. [Bug 1122671] */ #ifdef WORDS_BIGENDIAN -#if TCL_UTF_MAX > 4 +#if TCL_UTF_MAX > 3 *dst++ = (ch >> 24); *dst++ = ((ch >> 16) & 0xFF); *dst++ = ((ch >> 8) & 0xFF); @@ -2487,7 +2487,7 @@ UtfToUnicodeProc( *dst++ = (ch & 0xFF); #endif #else -#if TCL_UTF_MAX > 4 +#if TCL_UTF_MAX > 3 *dst++ = (ch & 0xFF); *dst++ = ((ch >> 8) & 0xFF); *dst++ = ((ch >> 16) & 0xFF); -- cgit v0.12 From 941ef44a3fce68b1dc81abb397a80f209b2ca982 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 21 Apr 2020 14:52:48 +0000 Subject: Revert the other encoding system backport. The blocking and failing tests are illustrations of existing tickets [1004065] and [1122671], recording that the encoding machinery hardcodes assumptions in multiple places that sizeof(Tcl_UniChar) == 2. Closing the segfault bug fix should not be hostage to fixing those old bugs. --- generic/tclEncoding.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/generic/tclEncoding.c b/generic/tclEncoding.c index 66bec44..6c16827 100644 --- a/generic/tclEncoding.c +++ b/generic/tclEncoding.c @@ -2470,33 +2470,20 @@ UtfToUnicodeProc( if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; break; - } + } src += TclUtfToUniChar(src, &ch); /* * Need to handle this in a way that won't cause misalignment * by casting dst to a Tcl_UniChar. [Bug 1122671] + * XXX: This hard-codes the assumed size of Tcl_UniChar as 2. 
*/ #ifdef WORDS_BIGENDIAN -#if TCL_UTF_MAX > 3 - *dst++ = (ch >> 24); - *dst++ = ((ch >> 16) & 0xFF); - *dst++ = ((ch >> 8) & 0xFF); - *dst++ = (ch & 0xFF); -#else *dst++ = (ch >> 8); *dst++ = (ch & 0xFF); -#endif -#else -#if TCL_UTF_MAX > 3 - *dst++ = (ch & 0xFF); - *dst++ = ((ch >> 8) & 0xFF); - *dst++ = ((ch >> 16) & 0xFF); - *dst++ = (ch >> 24); #else *dst++ = (ch & 0xFF); *dst++ = (ch >> 8); #endif -#endif } *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; -- cgit v0.12 From d48bca33242b3f10d21a25a6c6a91c27ae707b96 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 21 Apr 2020 15:41:01 +0000 Subject: Move testing command [testsize] from Windows to generic. Extend it to report sizeof(Tcl_UniChar). --- generic/tclTest.c | 31 +++++++++++++++++++++++++++++++ win/tclWinTest.c | 28 ---------------------------- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/generic/tclTest.c b/generic/tclTest.c index 8c29aa7..b9fd204 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -281,6 +281,7 @@ static Tcl_CmdProc Testset2Cmd; static Tcl_CmdProc TestseterrorcodeCmd; static Tcl_ObjCmdProc TestsetobjerrorcodeCmd; static Tcl_CmdProc TestsetplatformCmd; +static Tcl_ObjCmdProc TestsizeCmd; static Tcl_CmdProc TeststaticpkgCmd; static Tcl_CmdProc TesttranslatefilenameCmd; static Tcl_CmdProc TestupvarCmd; @@ -592,6 +593,7 @@ Tcltest_Init( TestFindLastCmd, NULL, NULL); Tcl_CreateCommand(interp, "testsetplatform", TestsetplatformCmd, NULL, NULL); + Tcl_CreateObjCommand(interp, "testsize", TestsizeCmd, NULL, NULL); Tcl_CreateCommand(interp, "teststaticpkg", TeststaticpkgCmd, NULL, NULL); Tcl_CreateCommand(interp, "testtranslatefilename", @@ -4122,6 +4124,35 @@ TestsetplatformCmd( return TCL_OK; } +static int +TestsizeCmd( + ClientData clientData, /* Unused */ + Tcl_Interp* interp, /* Tcl interpreter */ + int objc, /* Parameter count */ + Tcl_Obj *const * objv) /* Parameter vector */ +{ + if (objc != 2) { + goto syntax; + } + if (strcmp(Tcl_GetString(objv[1]), 
"time_t") == 0) { + Tcl_SetObjResult(interp, Tcl_NewWideIntObj(sizeof(time_t))); + return TCL_OK; + } + if (strcmp(Tcl_GetString(objv[1]), "st_mtime") == 0) { + Tcl_StatBuf *statPtr; + Tcl_SetObjResult(interp, Tcl_NewWideIntObj(sizeof(statPtr->st_mtime))); + return TCL_OK; + } + if (strcmp(Tcl_GetString(objv[1]), "unichar") == 0) { + Tcl_SetObjResult(interp, Tcl_NewWideIntObj(sizeof(Tcl_UniChar))); + return TCL_OK; + } + +syntax: + Tcl_WrongNumArgs(interp, 1, objv, "time_t|st_mtime|unichar"); + return TCL_ERROR; +} + /* *---------------------------------------------------------------------- * diff --git a/win/tclWinTest.c b/win/tclWinTest.c index 04878fe..7f49b63 100644 --- a/win/tclWinTest.c +++ b/win/tclWinTest.c @@ -39,8 +39,6 @@ static int TestwinclockCmd(ClientData dummy, Tcl_Interp* interp, int objc, Tcl_Obj *const objv[]); static int TestwinsleepCmd(ClientData dummy, Tcl_Interp* interp, int objc, Tcl_Obj *const objv[]); -static int TestSizeCmd(ClientData dummy, Tcl_Interp* interp, - int objc, Tcl_Obj *const objv[]); static Tcl_ObjCmdProc TestExceptionCmd; static int TestplatformChmod(const char *nativePath, int pmode); static int TestchmodCmd(ClientData dummy, @@ -78,7 +76,6 @@ TclplatformtestInit( Tcl_CreateObjCommand(interp, "testwinclock", TestwinclockCmd, NULL, NULL); Tcl_CreateObjCommand(interp, "testwinsleep", TestwinsleepCmd, NULL, NULL); Tcl_CreateObjCommand(interp, "testexcept", TestExceptionCmd, NULL, NULL); - Tcl_CreateObjCommand(interp, "testsize", TestSizeCmd, NULL, NULL); return TCL_OK; } @@ -312,31 +309,6 @@ TestwinsleepCmd( return TCL_OK; } -static int -TestSizeCmd( - ClientData clientData, /* Unused */ - Tcl_Interp* interp, /* Tcl interpreter */ - int objc, /* Parameter count */ - Tcl_Obj *const * objv) /* Parameter vector */ -{ - if (objc != 2) { - goto syntax; - } - if (strcmp(Tcl_GetString(objv[1]), "time_t") == 0) { - Tcl_SetObjResult(interp, Tcl_NewWideIntObj(sizeof(time_t))); - return TCL_OK; - } - if (strcmp(Tcl_GetString(objv[1]), 
"st_mtime") == 0) { - Tcl_StatBuf *statPtr; - Tcl_SetObjResult(interp, Tcl_NewWideIntObj(sizeof(statPtr->st_mtime))); - return TCL_OK; - } - -syntax: - Tcl_WrongNumArgs(interp, 1, objv, "time_t|st_mtime"); - return TCL_ERROR; -} - /* *---------------------------------------------------------------------- * -- cgit v0.12 From 1588b8475d8a1378e6e9504e10913d756d84983b Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 21 Apr 2020 15:52:03 +0000 Subject: Use new testing command to constrain tests to (sizeof(Tcl_UniChar) == 2) until bugs are fixed when (sizeof(Tcl_UniChar == 4). --- tests/chanio.test | 4 +++- tests/encoding.test | 6 +++++- tests/io.test | 5 ++++- tests/source.test | 5 ++++- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/tests/chanio.test b/tests/chanio.test index 5fae431..c2f561b 100644 --- a/tests/chanio.test +++ b/tests/chanio.test @@ -29,6 +29,8 @@ namespace eval ::tcl::test::io { variable msg variable expected + testConstraint ucs2 [expr { [llength [info commands testsize]] && + ([testsize unichar] == 2) }] testConstraint testchannel [llength [info commands testchannel]] testConstraint exec [llength [info commands exec]] testConstraint openpipe 1 @@ -875,7 +877,7 @@ test chan-io-6.44 {Tcl_GetsObj: input saw cr, not followed by cr} {stdio testcha chan close $f set x } [list "bbbbbbbbbbbbbbb" 15 "123456789abcdef" 1 4 "abcd" 0 3 "efg"] -test chan-io-6.45 {Tcl_GetsObj: input saw cr, skip right number of bytes} {stdio testchannel openpipe fileevent} { +test chan-io-6.45 {Tcl_GetsObj: input saw cr, skip right number of bytes} {stdio testchannel openpipe fileevent ucs2} { # Tcl_ExternalToUtf() set f [open "|[list [interpreter] $path(cat)]" w+] diff --git a/tests/encoding.test b/tests/encoding.test index 8722a93..ad55e26 100644 --- a/tests/encoding.test +++ b/tests/encoding.test @@ -32,6 +32,10 @@ proc runtests {} { testConstraint testencoding [llength [info commands testencoding]] testConstraint exec [llength [info commands exec]] 
+testConstraint ucs2 [expr { [llength [info commands testsize]] && + ([testsize unichar] == 2) }] + + # TclInitEncodingSubsystem is tested by the rest of this file # TclFinalizeEncodingSubsystem is not currently tested @@ -316,7 +320,7 @@ test encoding-15.3 {UtfToUtfProc null character input} { list [string bytelength $x] [string bytelength $y] $z } {1 2 c080} -test encoding-16.1 {UnicodeToUtfProc} { +test encoding-16.1 {UnicodeToUtfProc} ucs2 { set val [encoding convertfrom unicode NN] list $val [format %x [scan $val %c]] } "\u4e4e 4e4e" diff --git a/tests/io.test b/tests/io.test index 04fa1d2..1c18576 100644 --- a/tests/io.test +++ b/tests/io.test @@ -29,6 +29,9 @@ namespace eval ::tcl::test::io { variable msg variable expected +testConstraint ucs2 [expr { [llength [info commands testsize]] && + ([testsize unichar] == 2) }] + testConstraint testchannel [llength [info commands testchannel]] testConstraint exec [llength [info commands exec]] testConstraint openpipe 1 @@ -910,7 +913,7 @@ test io-6.44 {Tcl_GetsObj: input saw cr, not followed by cr} {stdio testchannel close $f set x } [list "bbbbbbbbbbbbbbb" 15 "123456789abcdef" 1 4 "abcd" 0 3 "efg"] -test io-6.45 {Tcl_GetsObj: input saw cr, skip right number of bytes} {stdio testchannel openpipe fileevent} { +test io-6.45 {Tcl_GetsObj: input saw cr, skip right number of bytes} {stdio testchannel openpipe fileevent ucs2} { # Tcl_ExternalToUtf() set f [open "|[list [interpreter] $path(cat)]" w+] diff --git a/tests/source.test b/tests/source.test index dc3c2d8..8511004 100644 --- a/tests/source.test +++ b/tests/source.test @@ -20,6 +20,9 @@ if {[catch {package require tcltest 2.1}]} { namespace eval ::tcl::test::source { namespace import ::tcltest::* +testConstraint ucs2 [expr { [llength [info commands testsize]] && + ([testsize unichar] == 2) }] + test source-1.1 {source command} -setup { set x "old x value" set y "old y value" @@ -232,7 +235,7 @@ test source-7.1 {source -encoding test} -setup { } -cleanup { removeFile 
source.file } -result correct -test source-7.2 {source -encoding test} -setup { +test source-7.2 {source -encoding test} -constraints ucs2 -setup { # This tests for bad interactions between [source -encoding] # and use of the Control-Z character (\u001A) as a cross-platform # EOF character by [source]. Here we write out and the [source] a -- cgit v0.12 From 4da0e252257e24143039784510363d066545be27 Mon Sep 17 00:00:00 2001 From: dgp Date: Tue, 21 Apr 2020 15:56:34 +0000 Subject: remove merge litter --- tests/util.test | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/util.test b/tests/util.test index a483de1..85c06dd 100644 --- a/tests/util.test +++ b/tests/util.test @@ -15,7 +15,6 @@ if {[lsearch [namespace children] ::tcltest] == -1} { testConstraint testdstring [llength [info commands testdstring]] testConstraint testconcatobj [llength [info commands testconcatobj]] testConstraint testdoubledigits [llength [info commands testdoubledigits]] -testConstraint compat85 [expr {[format %c 0x010000] == "\uFFFD"}] # Big test for correct ordering of data in [expr] -- cgit v0.12 From ce76e24a88d8c6c8abfd5da63402691c072e697b Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Tue, 21 Apr 2020 19:45:34 +0000 Subject: Improve the "testutfnext" command. It can now accept both bytes and strings, and it will test whether src[-1] is read without needing test-variations for it. --- generic/tclTest.c | 48 +++++++------ tests/utf.test | 208 +++++++++++++++++++++++++++--------------------------- 2 files changed, 131 insertions(+), 125 deletions(-) diff --git a/generic/tclTest.c b/generic/tclTest.c index b9fd204..7a531b4 100644 --- a/generic/tclTest.c +++ b/generic/tclTest.c @@ -7113,7 +7113,7 @@ SimpleListVolumes(void) /* * Used to check operations of Tcl_UtfNext. 
* - * Usage: testutfnext $bytes $offset + * Usage: testutfnext -bytestring $bytes */ static int @@ -7123,37 +7123,43 @@ TestUtfNextCmd( int objc, Tcl_Obj *const objv[]) { - int numBytes, offset = 0; + int numBytes; char *bytes; - const char *result; - Tcl_Obj *copy; + const char *result, *first; + char buffer[32]; + static const char tobetested[] = "\xFF\xFE\xF4\xF2\xF0\xEF\xE8\xE3\xE2\xE1\xE0\xC2\xC1\xC0\x82"; + const char *p = tobetested; + + if (objc != 3 || strcmp(Tcl_GetString(objv[1]), "-bytestring")) { + if (objc != 2) { + Tcl_WrongNumArgs(interp, 1, objv, "?-bytestring? bytes"); + return TCL_ERROR; + } + bytes = Tcl_GetStringFromObj(objv[1], &numBytes); + } else { + bytes = (char *) Tcl_GetByteArrayFromObj(objv[2], &numBytes); + } - if (objc < 2 || objc > 3) { - Tcl_WrongNumArgs(interp, 1, objv, "bytes ?offset?"); + if (numBytes > sizeof(buffer)-2) { + Tcl_AppendResult(interp, "\"testutfnext\" can only handle 30 bytes", NULL); return TCL_ERROR; } - bytes = (char *) Tcl_GetByteArrayFromObj(objv[1], &numBytes); + memcpy(buffer + 1, bytes, numBytes); + buffer[0] = buffer[numBytes + 1] = '\x00'; - if (objc == 3) { - if (TCL_OK != TclGetIntForIndex(interp, objv[2], numBytes, &offset)) { + first = Tcl_UtfNext(buffer + 1); + while ((buffer[0] = *p++) != '\0') { + /* Run Tcl_UtfNext with many more possible bytes at src[-1], all should give the same result */ + result = Tcl_UtfNext(buffer + 1); + if (first != result) { + Tcl_AppendResult(interp, "Tcl_UtfNext is not supposed to read src[-1]", NULL); return TCL_ERROR; } - if (offset < 0) { - offset = 0; - } - if (offset > numBytes) { - offset = numBytes; - } } - copy = Tcl_DuplicateObj(objv[1]); - bytes = (char *) Tcl_SetByteArrayLength(copy, numBytes+1); - bytes[numBytes] = '\0'; - result = Tcl_UtfNext(bytes + offset); - Tcl_SetObjResult(interp, Tcl_NewIntObj(result - bytes)); + Tcl_SetObjResult(interp, Tcl_NewIntObj(result - buffer - 1)); - Tcl_DecrRefCount(copy); return TCL_OK; } /* diff --git a/tests/utf.test 
b/tests/utf.test index 1c79f32..0a81ae3 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -143,7 +143,7 @@ test utf-5.2 {Tcl_UtfFindLast} {testfindlast testbytestring} { test utf-6.1 {Tcl_UtfNext} testutfnext { # This takes the pointer one past the terminating NUL. # This is really an invalid call. - testutfnext {} + testutfnext -bytestring {} } 1 test utf-6.2 {Tcl_UtfNext} testutfnext { testutfnext A @@ -152,301 +152,301 @@ test utf-6.3 {Tcl_UtfNext} testutfnext { testutfnext AA } 1 test utf-6.4 {Tcl_UtfNext} testutfnext { - testutfnext A\xA0 + testutfnext -bytestring A\xA0 } 1 test utf-6.5 {Tcl_UtfNext} testutfnext { - testutfnext A\xD0 + testutfnext -bytestring A\xD0 } 1 test utf-6.6 {Tcl_UtfNext} testutfnext { - testutfnext A\xE8 + testutfnext -bytestring A\xE8 } 1 test utf-6.7 {Tcl_UtfNext} testutfnext { - testutfnext A\xF2 + testutfnext -bytestring A\xF2 } 1 test utf-6.8 {Tcl_UtfNext} testutfnext { - testutfnext A\xF8 + testutfnext -bytestring A\xF8 } 1 test utf-6.9 {Tcl_UtfNext} testutfnext { - testutfnext \xA0 + testutfnext -bytestring \xA0 } 1 test utf-6.10 {Tcl_UtfNext} testutfnext { - testutfnext \xA0G + testutfnext -bytestring \xA0G } 1 test utf-6.11 {Tcl_UtfNext} testutfnext { - testutfnext \xA0\xA0 + testutfnext -bytestring \xA0\xA0 } 1 test utf-6.12 {Tcl_UtfNext} testutfnext { - testutfnext \xA0\xD0 + testutfnext -bytestring \xA0\xD0 } 1 test utf-6.13 {Tcl_UtfNext} testutfnext { - testutfnext \xA0\xE8 + testutfnext -bytestring \xA0\xE8 } 1 test utf-6.14 {Tcl_UtfNext} testutfnext { - testutfnext \xA0\xF2 + testutfnext -bytestring \xA0\xF2 } 1 test utf-6.15 {Tcl_UtfNext} testutfnext { - testutfnext \xA0\xF8 + testutfnext -bytestring \xA0\xF8 } 1 test utf-6.16 {Tcl_UtfNext} testutfnext { - testutfnext \xD0 + testutfnext -bytestring \xD0 } 1 test utf-6.17 {Tcl_UtfNext} testutfnext { - testutfnext \xD0G + testutfnext -bytestring \xD0G } 1 test utf-6.18 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xA0 + testutfnext -bytestring \xD0\xA0 } 2 test 
utf-6.19 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xD0 + testutfnext -bytestring \xD0\xD0 } 1 test utf-6.20 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xE8 + testutfnext -bytestring \xD0\xE8 } 1 test utf-6.21 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xF2 + testutfnext -bytestring \xD0\xF2 } 1 test utf-6.22 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xF8 + testutfnext -bytestring \xD0\xF8 } 1 test utf-6.23 {Tcl_UtfNext} testutfnext { - testutfnext \xE8 + testutfnext -bytestring \xE8 } 1 test utf-6.24 {Tcl_UtfNext} testutfnext { - testutfnext \xE8G + testutfnext -bytestring \xE8G } 1 test utf-6.25 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0 + testutfnext -bytestring \xE8\xA0 } 1 test utf-6.26 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xD0 + testutfnext -bytestring \xE8\xD0 } 1 test utf-6.27 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xE8 + testutfnext -bytestring \xE8\xE8 } 1 test utf-6.28 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xF2 + testutfnext -bytestring \xE8\xF2 } 1 test utf-6.29 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xF8 + testutfnext -bytestring \xE8\xF8 } 1 test utf-6.30 {Tcl_UtfNext} testutfnext { - testutfnext \xF2 + testutfnext -bytestring \xF2 } 1 test utf-6.31 {Tcl_UtfNext} testutfnext { - testutfnext \xF2G + testutfnext -bytestring \xF2G } 1 test utf-6.32 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0 + testutfnext -bytestring \xF2\xA0 } 1 test utf-6.33 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xD0 + testutfnext -bytestring \xF2\xD0 } 1 test utf-6.34 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xE8 + testutfnext -bytestring \xF2\xE8 } 1 test utf-6.35 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xF2 + testutfnext -bytestring \xF2\xF2 } 1 test utf-6.36 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xF8 + testutfnext -bytestring \xF2\xF8 } 1 test utf-6.37 {Tcl_UtfNext} testutfnext { - testutfnext \xF8 + testutfnext -bytestring \xF8 } 1 test utf-6.38 {Tcl_UtfNext} testutfnext { - testutfnext \xF8G + 
testutfnext -bytestring \xF8G } 1 test utf-6.39 {Tcl_UtfNext} testutfnext { - testutfnext \xF8\xA0 + testutfnext -bytestring \xF8\xA0 } 1 test utf-6.40 {Tcl_UtfNext} testutfnext { - testutfnext \xF8\xD0 + testutfnext -bytestring \xF8\xD0 } 1 test utf-6.41 {Tcl_UtfNext} testutfnext { - testutfnext \xF8\xE8 + testutfnext -bytestring \xF8\xE8 } 1 test utf-6.42 {Tcl_UtfNext} testutfnext { - testutfnext \xF8\xF2 + testutfnext -bytestring \xF8\xF2 } 1 test utf-6.43 {Tcl_UtfNext} testutfnext { - testutfnext \xF8\xF8 + testutfnext -bytestring \xF8\xF8 } 1 test utf-6.44 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xA0G + testutfnext -bytestring \xD0\xA0G } 2 test utf-6.45 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xA0\xA0 + testutfnext -bytestring \xD0\xA0\xA0 } 2 test utf-6.46 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xA0\xD0 + testutfnext -bytestring \xD0\xA0\xD0 } 2 test utf-6.47 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xA0\xE8 + testutfnext -bytestring \xD0\xA0\xE8 } 2 test utf-6.48 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xA0\xF2 + testutfnext -bytestring \xD0\xA0\xF2 } 2 test utf-6.49 {Tcl_UtfNext} testutfnext { - testutfnext \xD0\xA0\xF8 + testutfnext -bytestring \xD0\xA0\xF8 } 2 test utf-6.50 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0G + testutfnext -bytestring \xE8\xA0G } 1 test utf-6.51 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xA0 + testutfnext -bytestring \xE8\xA0\xA0 } 3 test utf-6.52 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xD0 + testutfnext -bytestring \xE8\xA0\xD0 } 1 test utf-6.53 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xE8 + testutfnext -bytestring \xE8\xA0\xE8 } 1 test utf-6.54 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xF2 + testutfnext -bytestring \xE8\xA0\xF2 } 1 test utf-6.55 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xF8 + testutfnext -bytestring \xE8\xA0\xF8 } 1 test utf-6.56 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0G + testutfnext -bytestring \xF2\xA0G } 1 test 
utf-6.57 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xA0 + testutfnext -bytestring \xF2\xA0\xA0 } 1 test utf-6.58 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xD0 + testutfnext -bytestring \xF2\xA0\xD0 } 1 test utf-6.59 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xE8 + testutfnext -bytestring \xF2\xA0\xE8 } 1 test utf-6.60 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xF2 + testutfnext -bytestring \xF2\xA0\xF2 } 1 test utf-6.61 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xF8 + testutfnext -bytestring \xF2\xA0\xF8 } 1 test utf-6.62 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xA0G + testutfnext -bytestring \xE8\xA0\xA0G } 3 test utf-6.63 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xA0\xA0 + testutfnext -bytestring \xE8\xA0\xA0\xA0 } 3 test utf-6.64 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xA0\xD0 + testutfnext -bytestring \xE8\xA0\xA0\xD0 } 3 test utf-6.65 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xA0\xE8 + testutfnext -bytestring \xE8\xA0\xA0\xE8 } 3 test utf-6.66 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xA0\xF2 + testutfnext -bytestring \xE8\xA0\xA0\xF2 } 3 test utf-6.67 {Tcl_UtfNext} testutfnext { - testutfnext \xE8\xA0\xA0\xF8 + testutfnext -bytestring \xE8\xA0\xA0\xF8 } 3 test utf-6.68 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xA0G + testutfnext -bytestring \xF2\xA0\xA0G } 1 test utf-6.69.0 {Tcl_UtfNext} {testutfnext compat85} { - testutfnext \xF2\xA0\xA0\xA0 + testutfnext -bytestring \xF2\xA0\xA0\xA0 } 1 test utf-6.69.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF2\xA0\xA0\xA0 + testutfnext -bytestring \xF2\xA0\xA0\xA0 } 4 test utf-6.70 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xA0\xD0 + testutfnext -bytestring \xF2\xA0\xA0\xD0 } 1 test utf-6.71 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xA0\xE8 + testutfnext -bytestring \xF2\xA0\xA0\xE8 } 1 test utf-6.71 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xA0\xF2 + testutfnext -bytestring 
\xF2\xA0\xA0\xF2 } 1 test utf-6.73 {Tcl_UtfNext} testutfnext { - testutfnext \xF2\xA0\xA0\xF8 + testutfnext -bytestring \xF2\xA0\xA0\xF8 } 1 test utf-6.74.0 {Tcl_UtfNext} {testutfnext compat85} { - testutfnext \xF2\xA0\xA0\xA0G + testutfnext -bytestring \xF2\xA0\xA0\xA0G } 1 test utf-6.74.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF2\xA0\xA0\xA0G + testutfnext -bytestring \xF2\xA0\xA0\xA0G } 4 test utf-6.75.0 {Tcl_UtfNext} {testutfnext compat85} { - testutfnext \xF2\xA0\xA0\xA0\xA0 + testutfnext -bytestring \xF2\xA0\xA0\xA0\xA0 } 1 test utf-6.75.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF2\xA0\xA0\xA0\xA0 + testutfnext -bytestring \xF2\xA0\xA0\xA0\xA0 } 4 test utf-6.76.0 {Tcl_UtfNext} {testutfnext compat85} { - testutfnext \xF2\xA0\xA0\xA0\xD0 + testutfnext -bytestring \xF2\xA0\xA0\xA0\xD0 } 1 test utf-6.76.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF2\xA0\xA0\xA0\xD0 + testutfnext -bytestring \xF2\xA0\xA0\xA0\xD0 } 4 test utf-6.77.0 {Tcl_UtfNext} {testutfnext compat85} { - testutfnext \xF2\xA0\xA0\xA0\xE8 + testutfnext -bytestring \xF2\xA0\xA0\xA0\xE8 } 1 test utf-6.77.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF2\xA0\xA0\xA0\xE8 + testutfnext -bytestring \xF2\xA0\xA0\xA0\xE8 } 4 test utf-6.78.0 {Tcl_UtfNext} {testutfnext compat85} { - testutfnext \xF2\xA0\xA0\xA0\xF2 + testutfnext -bytestring \xF2\xA0\xA0\xA0\xF2 } 1 test utf-6.78.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF2\xA0\xA0\xA0\xF2 + testutfnext -bytestring \xF2\xA0\xA0\xA0\xF2 } 4 test utf-6.79.0 {Tcl_UtfNext} {testutfnext compat85} { - testutfnext \xF2\xA0\xA0\xA0G\xF8 + testutfnext -bytestring \xF2\xA0\xA0\xA0G\xF8 } 1 test utf-6.79.1 {Tcl_UtfNext} {testutfnext fullutf} { - testutfnext \xF2\xA0\xA0\xA0G\xF8 + testutfnext -bytestring \xF2\xA0\xA0\xA0G\xF8 } 4 test utf-6.80 {Tcl_UtfNext - overlong sequences} testutfnext { - testutfnext \xC0\x80 + testutfnext -bytestring \xC0\x80 } 2 test utf-6.81 {Tcl_UtfNext - overlong sequences} 
testutfnext { - testutfnext \xC0\x81 + testutfnext -bytestring \xC0\x81 } 1 test utf-6.82 {Tcl_UtfNext - overlong sequences} testutfnext { - testutfnext \xC1\x80 + testutfnext -bytestring \xC1\x80 } 1 test utf-6.83 {Tcl_UtfNext - overlong sequences} testutfnext { - testutfnext \xC2\x80 + testutfnext -bytestring \xC2\x80 } 2 test utf-6.84 {Tcl_UtfNext - overlong sequences} testutfnext { - testutfnext \xE0\x80\x80 + testutfnext -bytestring \xE0\x80\x80 } 1 test utf-6.85 {Tcl_UtfNext - overlong sequences} testutfnext { - testutfnext \xE0\xA0\x80 + testutfnext -bytestring \xE0\xA0\x80 } 3 test utf-6.86 {Tcl_UtfNext - overlong sequences} testutfnext { - testutfnext \xF0\x80\x80\x80 + testutfnext -bytestring \xF0\x80\x80\x80 } 1 test utf-6.87.0 {Tcl_UtfNext - overlong sequences} {testutfnext compat85} { - testutfnext \xF0\x90\x80\x80 + testutfnext -bytestring \xF0\x90\x80\x80 } 1 test utf-6.87.1 {Tcl_UtfNext - overlong sequences} {testutfnext fullutf} { - testutfnext \xF0\x90\x80\x80 + testutfnext -bytestring \xF0\x90\x80\x80 } 4 test utf-6.88 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { - testutfnext \xA0\xA0 + testutfnext -bytestring \xA0\xA0 } 1 -test utf-6.88.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte valid sequence} {testutfnext} { - testutfnext \xE8\xA0\xA0 1 -} 2 test utf-6.89 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { - testutfnext \x80\x80 + testutfnext -bytestring \x80\x80 } 1 -test utf-6.89.1 {Tcl_UtfNext, pointing to 2th byte of 3-byte invalid sequence} {testutfnext} { - testutfnext \xF0\x80\x80 1 -} 2 test utf-6.90.0 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext compat85} { - testutfnext \xF4\x8F\xBF\xBF + testutfnext -bytestring \xF4\x8F\xBF\xBF } 1 test utf-6.90.1 {Tcl_UtfNext, validity check [493dccc2de]} {testutfnext fullutf} { - testutfnext \xF4\x8F\xBF\xBF + testutfnext -bytestring \xF4\x8F\xBF\xBF } 4 test utf-6.91 {Tcl_UtfNext, validity check [493dccc2de]} testutfnext { 
- testutfnext \xF4\x90\x80\x80 + testutfnext -bytestring \xF4\x90\x80\x80 +} 1 +test utf-6.92 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext { + testutfnext -bytestring \xA0\xA0\xA0 +} 1 +test utf-6.93 {Tcl_UtfNext, pointing to 2th byte of 4-byte invalid sequence} testutfnext { + testutfnext -bytestring \x80\x80\x80 } 1 test utf-7.1 {Tcl_UtfPrev} testutfprev { -- cgit v0.12