From df203baa1f787de574237d71c3df4491edc0dae4 Mon Sep 17 00:00:00 2001 From: "jan.nijtmans" Date: Fri, 11 Jul 2014 10:38:55 +0000 Subject: Starting with Unicode 6.3, the mongolian vowel separator (U+180e) is no longer a whitespace, but for Tcl it still is. "NEL/Next Line" (U+0085) should have been a Unicode whitespace, but never was in Tcl. This is corrected in Tcl 8.6, but for legacy reasons not in Tcl 8.5. Update documentation accordingly, and extend test-cases for Unicode 7 compliance. --- doc/string.n | 3 ++- generic/regc_locale.c | 4 ++-- tests/utf.test | 30 +++++++++++++++--------------- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/doc/string.n b/doc/string.n index f39d57c..7e427ab 100644 --- a/doc/string.n +++ b/doc/string.n @@ -161,7 +161,8 @@ Any Unicode printing character, including space. .IP \fBpunct\fR 12 Any Unicode punctuation character. .IP \fBspace\fR 12 -Any Unicode space character. +Any Unicode whitespace character or mongolian vowel separator (U+180e), +but not NEL/Next Line (U+0085). .IP \fBtrue\fR 12 Any of the forms allowed to \fBTcl_GetBoolean\fR where the value is true. diff --git a/generic/regc_locale.c b/generic/regc_locale.c index 8dba520..e056078 100644 --- a/generic/regc_locale.c +++ b/generic/regc_locale.c @@ -381,8 +381,8 @@ static const crange spaceRangeTable[] = { #define NUM_SPACE_RANGE (sizeof(spaceRangeTable)/sizeof(crange)) static const chr spaceCharTable[] = { - 0x20, 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x2060, - 0x3000, 0xfeff + 0x20, 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, + 0x3000 }; #define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(chr)) diff --git a/tests/utf.test b/tests/utf.test index 35c5f73..30200c1 100644 --- a/tests/utf.test +++ b/tests/utf.test @@ -278,15 +278,15 @@ test utf-20.1 {TclUniCharNcmp} { } {} test utf-21.1 {TclUniCharIsAlnum} { - # this returns 1 with Unicode 6 compliance + # this returns 1 with Unicode 7 compliance string is alnum \u1040\u021f\u0220 } {1} test utf-21.2 {unicode alnum char in regc_locale.c} { - # this returns 1 with Unicode 6 compliance + # this returns 1 with Unicode 7 compliance list [regexp {^[[:alnum:]]+$} \u1040\u021f\u0220] [regexp {^\w+$} \u1040\u021f\u0220] } {1 1} test utf-21.3 {unicode print char in regc_locale.c} { - # this returns 1 with Unicode 6 compliance + # this returns 1 with Unicode 7 compliance regexp {^[[:print:]]+$} \ufbc1 } 1 test utf-21.4 {TclUniCharIsGraph} { @@ -319,11 +319,11 @@ test utf-21.10 {unicode print char in regc_locale.c} { } {0} test utf-21.11 {TclUniCharIsControl} { # [Bug 3464428] - string is control \u00ad + string is control \u0000\u001f\u00ad\u0605\u061c\u180e\u2066\ufeff } {1} test utf-21.12 {unicode control char in regc_locale.c} { # [Bug 3464428], [Bug a876646efe] - regexp {^[[:cntrl:]]*$} \u0000\u001f\u00ad + regexp {^[[:cntrl:]]*$} \u0000\u001f\u00ad\u0605\u061c\u180e\u2066\ufeff } {1} test utf-22.1 {TclUniCharIsWordChar} { @@ -334,30 +334,30 @@ test utf-22.2 {TclUniCharIsWordChar} { } 10 test utf-23.1 {TclUniCharIsAlpha} { - # this returns 1 with Unicode 6 compliance - string is alpha \u021f\u0220 + # this returns 1 with Unicode 7 compliance + string is alpha \u021f\u0220\u037f\u052f } {1} test utf-23.2 {unicode alpha char in regc_locale.c} { - # this returns 1 with Unicode 6 compliance - regexp {^[[:alpha:]]+$} \u021f\u0220 + # this returns 1 with Unicode 7 compliance + regexp {^[[:alpha:]]+$} \u021f\u0220\u037f\u052f } {1} test utf-24.1 {TclUniCharIsDigit} { - # this returns 1 with Unicode 6 compliance + # this returns 1 with Unicode 7 compliance string is digit \u1040\uabf0 } {1} test utf-24.2 {unicode digit char in regc_locale.c} { - # this returns 1 with Unicode 6 compliance + # this returns 1 with Unicode 7 compliance list [regexp {^[[:digit:]]+$} \u1040\uabf0] [regexp {^\d+$} \u1040\uabf0] } {1 1} test utf-24.3 {TclUniCharIsSpace} { - # this returns 1 with Unicode 6 compliance - string is space \u1680\u180e + # this returns 1 with Unicode 7 compliance + string is space \u1680\u180e\u202f } {1} test utf-24.4 {unicode space char in regc_locale.c} { - # this returns 1 with Unicode 6 compliance - list [regexp {^[[:space:]]+$} \u1680\u180e] [regexp {^\s+$} \u1680\u180e] + # this returns 1 with Unicode 7 compliance + list [regexp {^[[:space:]]+$} \u1680\u180e\u202f] [regexp {^\s+$} \u1680\u180e\u202f] } {1 1} testConstraint teststringobj [llength [info commands teststringobj]] -- cgit v0.12